diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,135140 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 9650505, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00015543228048687607, + "grad_norm": 9.254081726074219, + "learning_rate": 4.999740946199189e-05, + "loss": 3.9557, + "step": 500 + }, + { + "epoch": 0.00031086456097375215, + "grad_norm": 7.8695173263549805, + "learning_rate": 4.999481892398377e-05, + "loss": 3.8625, + "step": 1000 + }, + { + "epoch": 0.0004662968414606282, + "grad_norm": 6.370519161224365, + "learning_rate": 4.999222838597566e-05, + "loss": 3.8333, + "step": 1500 + }, + { + "epoch": 0.0006217291219475043, + "grad_norm": 6.30256986618042, + "learning_rate": 4.9989637847967544e-05, + "loss": 3.761, + "step": 2000 + }, + { + "epoch": 0.0007771614024343804, + "grad_norm": 7.06025505065918, + "learning_rate": 4.998704730995943e-05, + "loss": 3.769, + "step": 2500 + }, + { + "epoch": 0.0009325936829212564, + "grad_norm": 12.265380859375, + "learning_rate": 4.998445677195132e-05, + "loss": 3.7099, + "step": 3000 + }, + { + "epoch": 0.0010880259634081326, + "grad_norm": 8.03703498840332, + "learning_rate": 4.99818662339432e-05, + "loss": 3.7121, + "step": 3500 + }, + { + "epoch": 0.0012434582438950086, + "grad_norm": 6.960049152374268, + "learning_rate": 4.9979275695935086e-05, + "loss": 3.6633, + "step": 4000 + }, + { + "epoch": 0.0013988905243818848, + "grad_norm": 8.182839393615723, + "learning_rate": 4.997668515792697e-05, + "loss": 3.6785, + "step": 4500 + }, + { + "epoch": 0.0015543228048687607, + "grad_norm": 6.969526767730713, + "learning_rate": 4.9974094619918854e-05, + "loss": 3.6416, + "step": 5000 + }, + { + "epoch": 0.001709755085355637, + "grad_norm": 7.408444881439209, + "learning_rate": 4.997150408191074e-05, + "loss": 3.619, + "step": 5500 + }, + { + "epoch": 0.0018651873658425129, + "grad_norm": 7.793248176574707, + "learning_rate": 4.996891354390263e-05, + "loss": 3.6227, + "step": 6000 + }, + { + "epoch": 0.002020619646329389, + "grad_norm": 6.6545281410217285, + "learning_rate": 4.996632300589451e-05, + "loss": 3.6432, + "step": 6500 + }, + { + "epoch": 0.0021760519268162653, + "grad_norm": 6.5722880363464355, + "learning_rate": 4.9963732467886395e-05, + "loss": 3.5936, + "step": 7000 + }, + { + "epoch": 0.0023314842073031412, + "grad_norm": 8.818721771240234, + "learning_rate": 4.996114192987828e-05, + "loss": 3.6193, + "step": 7500 + }, + { + "epoch": 0.002486916487790017, + "grad_norm": 6.086054801940918, + "learning_rate": 4.995855139187017e-05, + "loss": 3.5765, + "step": 8000 + }, + { + "epoch": 0.002642348768276893, + "grad_norm": 5.906280517578125, + "learning_rate": 4.995596085386206e-05, + "loss": 3.5696, + "step": 8500 + }, + { + "epoch": 0.0027977810487637696, + "grad_norm": 8.405414581298828, + "learning_rate": 4.9953370315853944e-05, + "loss": 3.6046, + "step": 9000 + }, + { + "epoch": 0.0029532133292506455, + "grad_norm": 6.125730991363525, + "learning_rate": 4.9950779777845824e-05, + "loss": 3.5812, + "step": 9500 + }, + { + "epoch": 0.0031086456097375215, + "grad_norm": 6.632938385009766, + "learning_rate": 4.994818923983771e-05, + "loss": 3.5818, + "step": 10000 + }, + { + "epoch": 0.0032640778902243974, + "grad_norm": 6.575578212738037, + "learning_rate": 4.994559870182959e-05, + "loss": 3.6066, + "step": 10500 + }, + { + "epoch": 0.003419510170711274, + "grad_norm": 8.715726852416992, + "learning_rate": 4.994300816382148e-05, + "loss": 3.5446, + "step": 11000 + }, + { + "epoch": 0.00357494245119815, + "grad_norm": 9.213972091674805, + "learning_rate": 4.9940417625813366e-05, + "loss": 3.5449, + "step": 11500 + }, + { + "epoch": 0.0037303747316850258, + "grad_norm": 7.966500282287598, + "learning_rate": 4.9937827087805253e-05, + "loss": 3.5598, + "step": 12000 + }, + { + "epoch": 0.0038858070121719017, + "grad_norm": 8.011313438415527, + "learning_rate": 4.993523654979714e-05, + "loss": 3.5448, + "step": 12500 + }, + { + "epoch": 0.004041239292658778, + "grad_norm": 6.3176116943359375, + "learning_rate": 4.993264601178903e-05, + "loss": 3.5389, + "step": 13000 + }, + { + "epoch": 0.004196671573145654, + "grad_norm": 4.929056644439697, + "learning_rate": 4.993005547378091e-05, + "loss": 3.5184, + "step": 13500 + }, + { + "epoch": 0.0043521038536325305, + "grad_norm": 7.387260437011719, + "learning_rate": 4.9927464935772795e-05, + "loss": 3.5385, + "step": 14000 + }, + { + "epoch": 0.004507536134119406, + "grad_norm": 6.908529758453369, + "learning_rate": 4.992487439776468e-05, + "loss": 3.5099, + "step": 14500 + }, + { + "epoch": 0.0046629684146062824, + "grad_norm": 6.51350736618042, + "learning_rate": 4.992228385975656e-05, + "loss": 3.5076, + "step": 15000 + }, + { + "epoch": 0.004818400695093158, + "grad_norm": 9.350944519042969, + "learning_rate": 4.991969332174845e-05, + "loss": 3.5341, + "step": 15500 + }, + { + "epoch": 0.004973832975580034, + "grad_norm": 7.53931188583374, + "learning_rate": 4.991710278374033e-05, + "loss": 3.4809, + "step": 16000 + }, + { + "epoch": 0.005129265256066911, + "grad_norm": 8.150294303894043, + "learning_rate": 4.991451224573222e-05, + "loss": 3.4735, + "step": 16500 + }, + { + "epoch": 0.005284697536553786, + "grad_norm": 6.0684638023376465, + "learning_rate": 4.9911921707724105e-05, + "loss": 3.5093, + "step": 17000 + }, + { + "epoch": 0.005440129817040663, + "grad_norm": 7.786524295806885, + "learning_rate": 4.990933116971599e-05, + "loss": 3.4568, + "step": 17500 + }, + { + "epoch": 0.005595562097527539, + "grad_norm": 5.70767879486084, + "learning_rate": 4.990674063170788e-05, + "loss": 3.4764, + "step": 18000 + }, + { + "epoch": 0.005750994378014415, + "grad_norm": 6.528026103973389, + "learning_rate": 4.9904150093699766e-05, + "loss": 3.474, + "step": 18500 + }, + { + "epoch": 0.005906426658501291, + "grad_norm": 6.154341220855713, + "learning_rate": 4.9901559555691646e-05, + "loss": 3.4715, + "step": 19000 + }, + { + "epoch": 0.0060618589389881666, + "grad_norm": 6.200184345245361, + "learning_rate": 4.9898969017683534e-05, + "loss": 3.4699, + "step": 19500 + }, + { + "epoch": 0.006217291219475043, + "grad_norm": 11.773934364318848, + "learning_rate": 4.989637847967542e-05, + "loss": 3.4729, + "step": 20000 + }, + { + "epoch": 0.006372723499961919, + "grad_norm": 7.110173225402832, + "learning_rate": 4.98937879416673e-05, + "loss": 3.4361, + "step": 20500 + }, + { + "epoch": 0.006528155780448795, + "grad_norm": 6.901799201965332, + "learning_rate": 4.989119740365919e-05, + "loss": 3.4489, + "step": 21000 + }, + { + "epoch": 0.006683588060935671, + "grad_norm": 7.096058368682861, + "learning_rate": 4.988860686565107e-05, + "loss": 3.4503, + "step": 21500 + }, + { + "epoch": 0.006839020341422548, + "grad_norm": 9.403504371643066, + "learning_rate": 4.988601632764296e-05, + "loss": 3.4693, + "step": 22000 + }, + { + "epoch": 0.006994452621909423, + "grad_norm": 7.4532976150512695, + "learning_rate": 4.988342578963485e-05, + "loss": 3.4385, + "step": 22500 + }, + { + "epoch": 0.0071498849023963, + "grad_norm": 6.7622904777526855, + "learning_rate": 4.988083525162673e-05, + "loss": 3.4319, + "step": 23000 + }, + { + "epoch": 0.007305317182883175, + "grad_norm": 7.09910249710083, + "learning_rate": 4.987824471361862e-05, + "loss": 3.4506, + "step": 23500 + }, + { + "epoch": 0.0074607494633700516, + "grad_norm": 6.877001762390137, + "learning_rate": 4.9875654175610504e-05, + "loss": 3.4281, + "step": 24000 + }, + { + "epoch": 0.007616181743856928, + "grad_norm": 9.011584281921387, + "learning_rate": 4.9873063637602385e-05, + "loss": 3.4442, + "step": 24500 + }, + { + "epoch": 0.0077716140243438035, + "grad_norm": 6.0322675704956055, + "learning_rate": 4.987047309959427e-05, + "loss": 3.4612, + "step": 25000 + }, + { + "epoch": 0.00792704630483068, + "grad_norm": 7.526337146759033, + "learning_rate": 4.986788256158616e-05, + "loss": 3.4413, + "step": 25500 + }, + { + "epoch": 0.008082478585317555, + "grad_norm": 6.7198262214660645, + "learning_rate": 4.986529202357804e-05, + "loss": 3.4605, + "step": 26000 + }, + { + "epoch": 0.008237910865804433, + "grad_norm": 8.762670516967773, + "learning_rate": 4.986270148556993e-05, + "loss": 3.4418, + "step": 26500 + }, + { + "epoch": 0.008393343146291308, + "grad_norm": 6.379214763641357, + "learning_rate": 4.9860110947561814e-05, + "loss": 3.4188, + "step": 27000 + }, + { + "epoch": 0.008548775426778184, + "grad_norm": 5.548459529876709, + "learning_rate": 4.98575204095537e-05, + "loss": 3.382, + "step": 27500 + }, + { + "epoch": 0.008704207707265061, + "grad_norm": 7.0447492599487305, + "learning_rate": 4.985492987154559e-05, + "loss": 3.3932, + "step": 28000 + }, + { + "epoch": 0.008859639987751937, + "grad_norm": 7.93557596206665, + "learning_rate": 4.985233933353747e-05, + "loss": 3.4208, + "step": 28500 + }, + { + "epoch": 0.009015072268238812, + "grad_norm": 8.870244026184082, + "learning_rate": 4.9849748795529356e-05, + "loss": 3.4337, + "step": 29000 + }, + { + "epoch": 0.009170504548725688, + "grad_norm": 6.678338527679443, + "learning_rate": 4.984715825752124e-05, + "loss": 3.4042, + "step": 29500 + }, + { + "epoch": 0.009325936829212565, + "grad_norm": 7.443600177764893, + "learning_rate": 4.984456771951312e-05, + "loss": 3.4181, + "step": 30000 + }, + { + "epoch": 0.00948136910969944, + "grad_norm": 6.692905426025391, + "learning_rate": 4.984197718150501e-05, + "loss": 3.4073, + "step": 30500 + }, + { + "epoch": 0.009636801390186316, + "grad_norm": 5.882769584655762, + "learning_rate": 4.98393866434969e-05, + "loss": 3.4098, + "step": 31000 + }, + { + "epoch": 0.009792233670673193, + "grad_norm": 5.080358982086182, + "learning_rate": 4.983679610548878e-05, + "loss": 3.3933, + "step": 31500 + }, + { + "epoch": 0.009947665951160069, + "grad_norm": 7.53864049911499, + "learning_rate": 4.983420556748067e-05, + "loss": 3.3817, + "step": 32000 + }, + { + "epoch": 0.010103098231646944, + "grad_norm": 6.4359259605407715, + "learning_rate": 4.983161502947256e-05, + "loss": 3.4318, + "step": 32500 + }, + { + "epoch": 0.010258530512133822, + "grad_norm": 7.688544273376465, + "learning_rate": 4.982902449146444e-05, + "loss": 3.3895, + "step": 33000 + }, + { + "epoch": 0.010413962792620697, + "grad_norm": 6.904985427856445, + "learning_rate": 4.9826433953456326e-05, + "loss": 3.4207, + "step": 33500 + }, + { + "epoch": 0.010569395073107573, + "grad_norm": 6.890560150146484, + "learning_rate": 4.982384341544821e-05, + "loss": 3.4102, + "step": 34000 + }, + { + "epoch": 0.01072482735359445, + "grad_norm": 7.183640480041504, + "learning_rate": 4.9821252877440094e-05, + "loss": 3.4115, + "step": 34500 + }, + { + "epoch": 0.010880259634081325, + "grad_norm": 6.865628242492676, + "learning_rate": 4.981866233943198e-05, + "loss": 3.4145, + "step": 35000 + }, + { + "epoch": 0.011035691914568201, + "grad_norm": 6.051755428314209, + "learning_rate": 4.981607180142386e-05, + "loss": 3.3975, + "step": 35500 + }, + { + "epoch": 0.011191124195055078, + "grad_norm": 6.527932167053223, + "learning_rate": 4.981348126341575e-05, + "loss": 3.3706, + "step": 36000 + }, + { + "epoch": 0.011346556475541954, + "grad_norm": 6.152163505554199, + "learning_rate": 4.9810890725407636e-05, + "loss": 3.3807, + "step": 36500 + }, + { + "epoch": 0.01150198875602883, + "grad_norm": 8.102402687072754, + "learning_rate": 4.980830018739952e-05, + "loss": 3.3917, + "step": 37000 + }, + { + "epoch": 0.011657421036515705, + "grad_norm": 6.99196195602417, + "learning_rate": 4.980570964939141e-05, + "loss": 3.3759, + "step": 37500 + }, + { + "epoch": 0.011812853317002582, + "grad_norm": 6.066771984100342, + "learning_rate": 4.98031191113833e-05, + "loss": 3.4117, + "step": 38000 + }, + { + "epoch": 0.011968285597489458, + "grad_norm": 5.633359909057617, + "learning_rate": 4.980052857337518e-05, + "loss": 3.3945, + "step": 38500 + }, + { + "epoch": 0.012123717877976333, + "grad_norm": 7.237349987030029, + "learning_rate": 4.9797938035367065e-05, + "loss": 3.3978, + "step": 39000 + }, + { + "epoch": 0.01227915015846321, + "grad_norm": 7.362928867340088, + "learning_rate": 4.9795347497358945e-05, + "loss": 3.374, + "step": 39500 + }, + { + "epoch": 0.012434582438950086, + "grad_norm": 7.534677982330322, + "learning_rate": 4.979275695935083e-05, + "loss": 3.3833, + "step": 40000 + }, + { + "epoch": 0.012590014719436961, + "grad_norm": 7.12122106552124, + "learning_rate": 4.979016642134272e-05, + "loss": 3.3804, + "step": 40500 + }, + { + "epoch": 0.012745446999923839, + "grad_norm": 5.8175764083862305, + "learning_rate": 4.97875758833346e-05, + "loss": 3.3722, + "step": 41000 + }, + { + "epoch": 0.012900879280410714, + "grad_norm": 6.738646507263184, + "learning_rate": 4.978498534532649e-05, + "loss": 3.369, + "step": 41500 + }, + { + "epoch": 0.01305631156089759, + "grad_norm": 7.5022664070129395, + "learning_rate": 4.978239480731838e-05, + "loss": 3.3762, + "step": 42000 + }, + { + "epoch": 0.013211743841384467, + "grad_norm": 6.5979180335998535, + "learning_rate": 4.977980426931026e-05, + "loss": 3.3955, + "step": 42500 + }, + { + "epoch": 0.013367176121871343, + "grad_norm": 6.088146686553955, + "learning_rate": 4.977721373130215e-05, + "loss": 3.3997, + "step": 43000 + }, + { + "epoch": 0.013522608402358218, + "grad_norm": 8.230671882629395, + "learning_rate": 4.9774623193294036e-05, + "loss": 3.3691, + "step": 43500 + }, + { + "epoch": 0.013678040682845095, + "grad_norm": 7.3701090812683105, + "learning_rate": 4.9772032655285916e-05, + "loss": 3.3735, + "step": 44000 + }, + { + "epoch": 0.013833472963331971, + "grad_norm": 6.865908622741699, + "learning_rate": 4.97694421172778e-05, + "loss": 3.3589, + "step": 44500 + }, + { + "epoch": 0.013988905243818846, + "grad_norm": 6.476613521575928, + "learning_rate": 4.976685157926969e-05, + "loss": 3.3761, + "step": 45000 + }, + { + "epoch": 0.014144337524305724, + "grad_norm": 6.928557395935059, + "learning_rate": 4.976426104126157e-05, + "loss": 3.3512, + "step": 45500 + }, + { + "epoch": 0.0142997698047926, + "grad_norm": 6.979996204376221, + "learning_rate": 4.976167050325346e-05, + "loss": 3.3585, + "step": 46000 + }, + { + "epoch": 0.014455202085279475, + "grad_norm": 14.082725524902344, + "learning_rate": 4.9759079965245345e-05, + "loss": 3.3327, + "step": 46500 + }, + { + "epoch": 0.01461063436576635, + "grad_norm": 6.0107197761535645, + "learning_rate": 4.975648942723723e-05, + "loss": 3.345, + "step": 47000 + }, + { + "epoch": 0.014766066646253228, + "grad_norm": 4.881377696990967, + "learning_rate": 4.975389888922912e-05, + "loss": 3.3535, + "step": 47500 + }, + { + "epoch": 0.014921498926740103, + "grad_norm": 7.203432083129883, + "learning_rate": 4.9751308351221e-05, + "loss": 3.3138, + "step": 48000 + }, + { + "epoch": 0.015076931207226979, + "grad_norm": 5.489126682281494, + "learning_rate": 4.974871781321289e-05, + "loss": 3.3398, + "step": 48500 + }, + { + "epoch": 0.015232363487713856, + "grad_norm": 7.7338643074035645, + "learning_rate": 4.9746127275204774e-05, + "loss": 3.3797, + "step": 49000 + }, + { + "epoch": 0.015387795768200731, + "grad_norm": 7.304266452789307, + "learning_rate": 4.9743536737196654e-05, + "loss": 3.3741, + "step": 49500 + }, + { + "epoch": 0.015543228048687607, + "grad_norm": 7.9823832511901855, + "learning_rate": 4.974094619918854e-05, + "loss": 3.3604, + "step": 50000 + }, + { + "epoch": 0.015698660329174483, + "grad_norm": 7.357463836669922, + "learning_rate": 4.973835566118043e-05, + "loss": 3.3759, + "step": 50500 + }, + { + "epoch": 0.01585409260966136, + "grad_norm": 6.471344947814941, + "learning_rate": 4.973576512317231e-05, + "loss": 3.3884, + "step": 51000 + }, + { + "epoch": 0.016009524890148237, + "grad_norm": 7.234046459197998, + "learning_rate": 4.9733174585164196e-05, + "loss": 3.3535, + "step": 51500 + }, + { + "epoch": 0.01616495717063511, + "grad_norm": 6.387459754943848, + "learning_rate": 4.973058404715608e-05, + "loss": 3.3645, + "step": 52000 + }, + { + "epoch": 0.016320389451121988, + "grad_norm": 6.718079090118408, + "learning_rate": 4.972799350914797e-05, + "loss": 3.2952, + "step": 52500 + }, + { + "epoch": 0.016475821731608865, + "grad_norm": 8.60452651977539, + "learning_rate": 4.972540297113986e-05, + "loss": 3.3523, + "step": 53000 + }, + { + "epoch": 0.01663125401209574, + "grad_norm": 9.456363677978516, + "learning_rate": 4.972281243313174e-05, + "loss": 3.36, + "step": 53500 + }, + { + "epoch": 0.016786686292582616, + "grad_norm": 7.311404705047607, + "learning_rate": 4.9720221895123625e-05, + "loss": 3.335, + "step": 54000 + }, + { + "epoch": 0.016942118573069494, + "grad_norm": 7.228612422943115, + "learning_rate": 4.971763135711551e-05, + "loss": 3.3281, + "step": 54500 + }, + { + "epoch": 0.017097550853556368, + "grad_norm": 6.681599140167236, + "learning_rate": 4.971504081910739e-05, + "loss": 3.324, + "step": 55000 + }, + { + "epoch": 0.017252983134043245, + "grad_norm": 5.377241134643555, + "learning_rate": 4.971245028109928e-05, + "loss": 3.308, + "step": 55500 + }, + { + "epoch": 0.017408415414530122, + "grad_norm": 6.771687984466553, + "learning_rate": 4.970985974309117e-05, + "loss": 3.3182, + "step": 56000 + }, + { + "epoch": 0.017563847695016996, + "grad_norm": 8.462624549865723, + "learning_rate": 4.9707269205083054e-05, + "loss": 3.3356, + "step": 56500 + }, + { + "epoch": 0.017719279975503873, + "grad_norm": 6.195849895477295, + "learning_rate": 4.970467866707494e-05, + "loss": 3.3193, + "step": 57000 + }, + { + "epoch": 0.01787471225599075, + "grad_norm": 7.253998279571533, + "learning_rate": 4.970208812906682e-05, + "loss": 3.3274, + "step": 57500 + }, + { + "epoch": 0.018030144536477624, + "grad_norm": 6.2544074058532715, + "learning_rate": 4.969949759105871e-05, + "loss": 3.3216, + "step": 58000 + }, + { + "epoch": 0.0181855768169645, + "grad_norm": 5.746667861938477, + "learning_rate": 4.9696907053050596e-05, + "loss": 3.348, + "step": 58500 + }, + { + "epoch": 0.018341009097451375, + "grad_norm": 6.633561134338379, + "learning_rate": 4.9694316515042476e-05, + "loss": 3.3257, + "step": 59000 + }, + { + "epoch": 0.018496441377938252, + "grad_norm": 7.207036972045898, + "learning_rate": 4.9691725977034364e-05, + "loss": 3.2947, + "step": 59500 + }, + { + "epoch": 0.01865187365842513, + "grad_norm": 6.122809410095215, + "learning_rate": 4.968913543902625e-05, + "loss": 3.3318, + "step": 60000 + }, + { + "epoch": 0.018807305938912004, + "grad_norm": 10.286441802978516, + "learning_rate": 4.968654490101813e-05, + "loss": 3.3754, + "step": 60500 + }, + { + "epoch": 0.01896273821939888, + "grad_norm": 7.422279357910156, + "learning_rate": 4.968395436301002e-05, + "loss": 3.3269, + "step": 61000 + }, + { + "epoch": 0.019118170499885758, + "grad_norm": 7.439267635345459, + "learning_rate": 4.9681363825001905e-05, + "loss": 3.3634, + "step": 61500 + }, + { + "epoch": 0.019273602780372632, + "grad_norm": 8.98193073272705, + "learning_rate": 4.967877328699379e-05, + "loss": 3.307, + "step": 62000 + }, + { + "epoch": 0.01942903506085951, + "grad_norm": 5.688362121582031, + "learning_rate": 4.967618274898568e-05, + "loss": 3.2964, + "step": 62500 + }, + { + "epoch": 0.019584467341346386, + "grad_norm": 8.804769515991211, + "learning_rate": 4.967359221097757e-05, + "loss": 3.3267, + "step": 63000 + }, + { + "epoch": 0.01973989962183326, + "grad_norm": 7.419903755187988, + "learning_rate": 4.967100167296945e-05, + "loss": 3.3166, + "step": 63500 + }, + { + "epoch": 0.019895331902320137, + "grad_norm": 6.8906569480896, + "learning_rate": 4.9668411134961334e-05, + "loss": 3.346, + "step": 64000 + }, + { + "epoch": 0.020050764182807015, + "grad_norm": 6.8461713790893555, + "learning_rate": 4.9665820596953215e-05, + "loss": 3.3266, + "step": 64500 + }, + { + "epoch": 0.02020619646329389, + "grad_norm": 5.569315433502197, + "learning_rate": 4.96632300589451e-05, + "loss": 3.374, + "step": 65000 + }, + { + "epoch": 0.020361628743780766, + "grad_norm": 5.782054901123047, + "learning_rate": 4.966063952093699e-05, + "loss": 3.2958, + "step": 65500 + }, + { + "epoch": 0.020517061024267643, + "grad_norm": 6.8936004638671875, + "learning_rate": 4.9658048982928876e-05, + "loss": 3.286, + "step": 66000 + }, + { + "epoch": 0.020672493304754517, + "grad_norm": 7.049856662750244, + "learning_rate": 4.9655458444920763e-05, + "loss": 3.3146, + "step": 66500 + }, + { + "epoch": 0.020827925585241394, + "grad_norm": 7.000786781311035, + "learning_rate": 4.965286790691265e-05, + "loss": 3.3441, + "step": 67000 + }, + { + "epoch": 0.02098335786572827, + "grad_norm": 6.651955604553223, + "learning_rate": 4.965027736890453e-05, + "loss": 3.3262, + "step": 67500 + }, + { + "epoch": 0.021138790146215145, + "grad_norm": 6.071671485900879, + "learning_rate": 4.964768683089642e-05, + "loss": 3.3051, + "step": 68000 + }, + { + "epoch": 0.021294222426702022, + "grad_norm": 6.597151279449463, + "learning_rate": 4.9645096292888305e-05, + "loss": 3.3539, + "step": 68500 + }, + { + "epoch": 0.0214496547071889, + "grad_norm": 5.767997741699219, + "learning_rate": 4.9642505754880186e-05, + "loss": 3.3209, + "step": 69000 + }, + { + "epoch": 0.021605086987675774, + "grad_norm": 7.603672981262207, + "learning_rate": 4.963991521687207e-05, + "loss": 3.3379, + "step": 69500 + }, + { + "epoch": 0.02176051926816265, + "grad_norm": 6.447910308837891, + "learning_rate": 4.963732467886395e-05, + "loss": 3.3273, + "step": 70000 + }, + { + "epoch": 0.021915951548649528, + "grad_norm": 6.338496208190918, + "learning_rate": 4.963473414085584e-05, + "loss": 3.3109, + "step": 70500 + }, + { + "epoch": 0.022071383829136402, + "grad_norm": 7.420943737030029, + "learning_rate": 4.963214360284773e-05, + "loss": 3.2979, + "step": 71000 + }, + { + "epoch": 0.02222681610962328, + "grad_norm": 7.745577335357666, + "learning_rate": 4.9629553064839615e-05, + "loss": 3.2677, + "step": 71500 + }, + { + "epoch": 0.022382248390110156, + "grad_norm": 7.908437728881836, + "learning_rate": 4.96269625268315e-05, + "loss": 3.3271, + "step": 72000 + }, + { + "epoch": 0.02253768067059703, + "grad_norm": 8.758095741271973, + "learning_rate": 4.962437198882339e-05, + "loss": 3.3121, + "step": 72500 + }, + { + "epoch": 0.022693112951083907, + "grad_norm": 6.864973068237305, + "learning_rate": 4.962178145081527e-05, + "loss": 3.2903, + "step": 73000 + }, + { + "epoch": 0.022848545231570785, + "grad_norm": 6.369244575500488, + "learning_rate": 4.9619190912807156e-05, + "loss": 3.3731, + "step": 73500 + }, + { + "epoch": 0.02300397751205766, + "grad_norm": 6.395854949951172, + "learning_rate": 4.9616600374799044e-05, + "loss": 3.3195, + "step": 74000 + }, + { + "epoch": 0.023159409792544536, + "grad_norm": 6.067129135131836, + "learning_rate": 4.9614009836790924e-05, + "loss": 3.3099, + "step": 74500 + }, + { + "epoch": 0.02331484207303141, + "grad_norm": 7.439714431762695, + "learning_rate": 4.961141929878281e-05, + "loss": 3.3027, + "step": 75000 + }, + { + "epoch": 0.023470274353518287, + "grad_norm": 5.966531276702881, + "learning_rate": 4.960882876077469e-05, + "loss": 3.2979, + "step": 75500 + }, + { + "epoch": 0.023625706634005164, + "grad_norm": 5.246225833892822, + "learning_rate": 4.9606238222766585e-05, + "loss": 3.286, + "step": 76000 + }, + { + "epoch": 0.023781138914492038, + "grad_norm": 16.400691986083984, + "learning_rate": 4.960364768475847e-05, + "loss": 3.2773, + "step": 76500 + }, + { + "epoch": 0.023936571194978915, + "grad_norm": 7.908066272735596, + "learning_rate": 4.960105714675035e-05, + "loss": 3.3137, + "step": 77000 + }, + { + "epoch": 0.024092003475465792, + "grad_norm": 7.265084266662598, + "learning_rate": 4.959846660874224e-05, + "loss": 3.3001, + "step": 77500 + }, + { + "epoch": 0.024247435755952666, + "grad_norm": 8.910933494567871, + "learning_rate": 4.959587607073413e-05, + "loss": 3.3051, + "step": 78000 + }, + { + "epoch": 0.024402868036439544, + "grad_norm": 9.363746643066406, + "learning_rate": 4.959328553272601e-05, + "loss": 3.2842, + "step": 78500 + }, + { + "epoch": 0.02455830031692642, + "grad_norm": 8.87469482421875, + "learning_rate": 4.9590694994717895e-05, + "loss": 3.2933, + "step": 79000 + }, + { + "epoch": 0.024713732597413295, + "grad_norm": 11.57091236114502, + "learning_rate": 4.958810445670978e-05, + "loss": 3.3045, + "step": 79500 + }, + { + "epoch": 0.024869164877900172, + "grad_norm": 8.748586654663086, + "learning_rate": 4.958551391870166e-05, + "loss": 3.2867, + "step": 80000 + }, + { + "epoch": 0.02502459715838705, + "grad_norm": 7.570953845977783, + "learning_rate": 4.958292338069355e-05, + "loss": 3.3287, + "step": 80500 + }, + { + "epoch": 0.025180029438873923, + "grad_norm": 8.075359344482422, + "learning_rate": 4.958033284268544e-05, + "loss": 3.2333, + "step": 81000 + }, + { + "epoch": 0.0253354617193608, + "grad_norm": 4.990267276763916, + "learning_rate": 4.9577742304677324e-05, + "loss": 3.2836, + "step": 81500 + }, + { + "epoch": 0.025490893999847677, + "grad_norm": 8.585162162780762, + "learning_rate": 4.957515176666921e-05, + "loss": 3.3193, + "step": 82000 + }, + { + "epoch": 0.02564632628033455, + "grad_norm": 8.033713340759277, + "learning_rate": 4.957256122866109e-05, + "loss": 3.3299, + "step": 82500 + }, + { + "epoch": 0.02580175856082143, + "grad_norm": 7.623437881469727, + "learning_rate": 4.956997069065298e-05, + "loss": 3.3134, + "step": 83000 + }, + { + "epoch": 0.025957190841308306, + "grad_norm": 6.594669342041016, + "learning_rate": 4.9567380152644866e-05, + "loss": 3.3102, + "step": 83500 + }, + { + "epoch": 0.02611262312179518, + "grad_norm": 10.27623462677002, + "learning_rate": 4.9564789614636746e-05, + "loss": 3.355, + "step": 84000 + }, + { + "epoch": 0.026268055402282057, + "grad_norm": 6.685576915740967, + "learning_rate": 4.956219907662863e-05, + "loss": 3.3094, + "step": 84500 + }, + { + "epoch": 0.026423487682768934, + "grad_norm": 6.233862400054932, + "learning_rate": 4.955960853862052e-05, + "loss": 3.3246, + "step": 85000 + }, + { + "epoch": 0.026578919963255808, + "grad_norm": 10.732805252075195, + "learning_rate": 4.95570180006124e-05, + "loss": 3.2804, + "step": 85500 + }, + { + "epoch": 0.026734352243742685, + "grad_norm": 5.845518589019775, + "learning_rate": 4.9554427462604295e-05, + "loss": 3.3559, + "step": 86000 + }, + { + "epoch": 0.026889784524229562, + "grad_norm": 7.95506477355957, + "learning_rate": 4.955183692459618e-05, + "loss": 3.2989, + "step": 86500 + }, + { + "epoch": 0.027045216804716436, + "grad_norm": 5.403649806976318, + "learning_rate": 4.954924638658806e-05, + "loss": 3.3008, + "step": 87000 + }, + { + "epoch": 0.027200649085203314, + "grad_norm": 6.9295806884765625, + "learning_rate": 4.954665584857995e-05, + "loss": 3.3041, + "step": 87500 + }, + { + "epoch": 0.02735608136569019, + "grad_norm": 8.007198333740234, + "learning_rate": 4.954406531057183e-05, + "loss": 3.3085, + "step": 88000 + }, + { + "epoch": 0.027511513646177065, + "grad_norm": 8.355571746826172, + "learning_rate": 4.954147477256372e-05, + "loss": 3.3369, + "step": 88500 + }, + { + "epoch": 0.027666945926663942, + "grad_norm": 6.723972797393799, + "learning_rate": 4.9538884234555604e-05, + "loss": 3.299, + "step": 89000 + }, + { + "epoch": 0.02782237820715082, + "grad_norm": 9.193892478942871, + "learning_rate": 4.9536293696547484e-05, + "loss": 3.3253, + "step": 89500 + }, + { + "epoch": 0.027977810487637693, + "grad_norm": 9.252047538757324, + "learning_rate": 4.953370315853937e-05, + "loss": 3.2857, + "step": 90000 + }, + { + "epoch": 0.02813324276812457, + "grad_norm": 7.347903728485107, + "learning_rate": 4.953111262053126e-05, + "loss": 3.3071, + "step": 90500 + }, + { + "epoch": 0.028288675048611447, + "grad_norm": 15.721242904663086, + "learning_rate": 4.9528522082523146e-05, + "loss": 3.3513, + "step": 91000 + }, + { + "epoch": 0.02844410732909832, + "grad_norm": 8.270145416259766, + "learning_rate": 4.952593154451503e-05, + "loss": 3.3273, + "step": 91500 + }, + { + "epoch": 0.0285995396095852, + "grad_norm": 6.680537223815918, + "learning_rate": 4.952334100650692e-05, + "loss": 3.29, + "step": 92000 + }, + { + "epoch": 0.028754971890072072, + "grad_norm": 6.206247806549072, + "learning_rate": 4.95207504684988e-05, + "loss": 3.3279, + "step": 92500 + }, + { + "epoch": 0.02891040417055895, + "grad_norm": 7.5918073654174805, + "learning_rate": 4.951815993049069e-05, + "loss": 3.2438, + "step": 93000 + }, + { + "epoch": 0.029065836451045827, + "grad_norm": 6.817627906799316, + "learning_rate": 4.951556939248257e-05, + "loss": 3.2801, + "step": 93500 + }, + { + "epoch": 0.0292212687315327, + "grad_norm": 6.455772876739502, + "learning_rate": 4.9512978854474455e-05, + "loss": 3.2808, + "step": 94000 + }, + { + "epoch": 0.029376701012019578, + "grad_norm": 8.36837100982666, + "learning_rate": 4.951038831646634e-05, + "loss": 3.2875, + "step": 94500 + }, + { + "epoch": 0.029532133292506455, + "grad_norm": 10.20489501953125, + "learning_rate": 4.950779777845822e-05, + "loss": 3.2783, + "step": 95000 + }, + { + "epoch": 0.02968756557299333, + "grad_norm": 6.9074201583862305, + "learning_rate": 4.950520724045012e-05, + "loss": 3.3017, + "step": 95500 + }, + { + "epoch": 0.029842997853480206, + "grad_norm": 8.135815620422363, + "learning_rate": 4.9502616702442004e-05, + "loss": 3.2979, + "step": 96000 + }, + { + "epoch": 0.029998430133967083, + "grad_norm": 7.589613914489746, + "learning_rate": 4.9500026164433884e-05, + "loss": 3.2993, + "step": 96500 + }, + { + "epoch": 0.030153862414453957, + "grad_norm": 6.562385559082031, + "learning_rate": 4.949743562642577e-05, + "loss": 3.2688, + "step": 97000 + }, + { + "epoch": 0.030309294694940835, + "grad_norm": 20.57623863220215, + "learning_rate": 4.949484508841766e-05, + "loss": 3.2918, + "step": 97500 + }, + { + "epoch": 0.030464726975427712, + "grad_norm": 11.39439582824707, + "learning_rate": 4.949225455040954e-05, + "loss": 3.276, + "step": 98000 + }, + { + "epoch": 0.030620159255914586, + "grad_norm": 7.604848384857178, + "learning_rate": 4.9489664012401426e-05, + "loss": 3.2768, + "step": 98500 + }, + { + "epoch": 0.030775591536401463, + "grad_norm": 6.845335960388184, + "learning_rate": 4.948707347439331e-05, + "loss": 3.3231, + "step": 99000 + }, + { + "epoch": 0.03093102381688834, + "grad_norm": 6.965633392333984, + "learning_rate": 4.9484482936385194e-05, + "loss": 3.2656, + "step": 99500 + }, + { + "epoch": 0.031086456097375214, + "grad_norm": 8.290983200073242, + "learning_rate": 4.948189239837708e-05, + "loss": 3.2553, + "step": 100000 + }, + { + "epoch": 0.03124188837786209, + "grad_norm": 6.196207523345947, + "learning_rate": 4.947930186036897e-05, + "loss": 3.287, + "step": 100500 + }, + { + "epoch": 0.031397320658348965, + "grad_norm": 7.205747127532959, + "learning_rate": 4.9476711322360855e-05, + "loss": 3.3308, + "step": 101000 + }, + { + "epoch": 0.031552752938835846, + "grad_norm": 7.304210662841797, + "learning_rate": 4.947412078435274e-05, + "loss": 3.3044, + "step": 101500 + }, + { + "epoch": 0.03170818521932272, + "grad_norm": 7.48699951171875, + "learning_rate": 4.947153024634462e-05, + "loss": 3.2691, + "step": 102000 + }, + { + "epoch": 0.03186361749980959, + "grad_norm": 4.56908655166626, + "learning_rate": 4.946893970833651e-05, + "loss": 3.3114, + "step": 102500 + }, + { + "epoch": 0.032019049780296474, + "grad_norm": 6.150084018707275, + "learning_rate": 4.94663491703284e-05, + "loss": 3.3276, + "step": 103000 + }, + { + "epoch": 0.03217448206078335, + "grad_norm": 7.188169956207275, + "learning_rate": 4.946375863232028e-05, + "loss": 3.2438, + "step": 103500 + }, + { + "epoch": 0.03232991434127022, + "grad_norm": 7.250753402709961, + "learning_rate": 4.9461168094312164e-05, + "loss": 3.2767, + "step": 104000 + }, + { + "epoch": 0.0324853466217571, + "grad_norm": 7.546389579772949, + "learning_rate": 4.945857755630405e-05, + "loss": 3.2827, + "step": 104500 + }, + { + "epoch": 0.032640778902243976, + "grad_norm": 7.198559761047363, + "learning_rate": 4.945598701829593e-05, + "loss": 3.2869, + "step": 105000 + }, + { + "epoch": 0.03279621118273085, + "grad_norm": 7.386605739593506, + "learning_rate": 4.9453396480287826e-05, + "loss": 3.2999, + "step": 105500 + }, + { + "epoch": 0.03295164346321773, + "grad_norm": 7.7611308097839355, + "learning_rate": 4.9450805942279706e-05, + "loss": 3.263, + "step": 106000 + }, + { + "epoch": 0.033107075743704605, + "grad_norm": 7.278429985046387, + "learning_rate": 4.944821540427159e-05, + "loss": 3.3, + "step": 106500 + }, + { + "epoch": 0.03326250802419148, + "grad_norm": 7.537839412689209, + "learning_rate": 4.944562486626348e-05, + "loss": 3.2805, + "step": 107000 + }, + { + "epoch": 0.03341794030467836, + "grad_norm": 8.899394989013672, + "learning_rate": 4.944303432825536e-05, + "loss": 3.2901, + "step": 107500 + }, + { + "epoch": 0.03357337258516523, + "grad_norm": 6.20924711227417, + "learning_rate": 4.944044379024725e-05, + "loss": 3.3047, + "step": 108000 + }, + { + "epoch": 0.03372880486565211, + "grad_norm": 16.79630470275879, + "learning_rate": 4.9437853252239135e-05, + "loss": 3.3065, + "step": 108500 + }, + { + "epoch": 0.03388423714613899, + "grad_norm": 5.915986061096191, + "learning_rate": 4.9435262714231016e-05, + "loss": 3.274, + "step": 109000 + }, + { + "epoch": 0.03403966942662586, + "grad_norm": 11.477309226989746, + "learning_rate": 4.94326721762229e-05, + "loss": 3.2263, + "step": 109500 + }, + { + "epoch": 0.034195101707112735, + "grad_norm": 8.60254955291748, + "learning_rate": 4.943008163821479e-05, + "loss": 3.279, + "step": 110000 + }, + { + "epoch": 0.034350533987599616, + "grad_norm": 7.10599946975708, + "learning_rate": 4.942749110020668e-05, + "loss": 3.3222, + "step": 110500 + }, + { + "epoch": 0.03450596626808649, + "grad_norm": 6.196963310241699, + "learning_rate": 4.9424900562198564e-05, + "loss": 3.3093, + "step": 111000 + }, + { + "epoch": 0.03466139854857336, + "grad_norm": 9.308621406555176, + "learning_rate": 4.9422310024190445e-05, + "loss": 3.2967, + "step": 111500 + }, + { + "epoch": 0.034816830829060244, + "grad_norm": 6.828526496887207, + "learning_rate": 4.941971948618233e-05, + "loss": 3.3329, + "step": 112000 + }, + { + "epoch": 0.03497226310954712, + "grad_norm": 7.003742218017578, + "learning_rate": 4.941712894817422e-05, + "loss": 3.2957, + "step": 112500 + }, + { + "epoch": 0.03512769539003399, + "grad_norm": 8.319876670837402, + "learning_rate": 4.94145384101661e-05, + "loss": 3.2725, + "step": 113000 + }, + { + "epoch": 0.03528312767052087, + "grad_norm": 7.878983974456787, + "learning_rate": 4.9411947872157986e-05, + "loss": 3.272, + "step": 113500 + }, + { + "epoch": 0.035438559951007746, + "grad_norm": 22.191219329833984, + "learning_rate": 4.9409357334149874e-05, + "loss": 3.2822, + "step": 114000 + }, + { + "epoch": 0.03559399223149462, + "grad_norm": 7.173155307769775, + "learning_rate": 4.9406766796141754e-05, + "loss": 3.2518, + "step": 114500 + }, + { + "epoch": 0.0357494245119815, + "grad_norm": 7.309353351593018, + "learning_rate": 4.940417625813364e-05, + "loss": 3.2637, + "step": 115000 + }, + { + "epoch": 0.035904856792468375, + "grad_norm": 25.498620986938477, + "learning_rate": 4.9401585720125535e-05, + "loss": 3.248, + "step": 115500 + }, + { + "epoch": 0.03606028907295525, + "grad_norm": 10.399608612060547, + "learning_rate": 4.9398995182117415e-05, + "loss": 3.3009, + "step": 116000 + }, + { + "epoch": 0.03621572135344212, + "grad_norm": 11.32225513458252, + "learning_rate": 4.93964046441093e-05, + "loss": 3.2935, + "step": 116500 + }, + { + "epoch": 0.036371153633929, + "grad_norm": 6.4315505027771, + "learning_rate": 4.939381410610119e-05, + "loss": 3.27, + "step": 117000 + }, + { + "epoch": 0.03652658591441588, + "grad_norm": 7.148208141326904, + "learning_rate": 4.939122356809307e-05, + "loss": 3.2892, + "step": 117500 + }, + { + "epoch": 0.03668201819490275, + "grad_norm": 6.894265174865723, + "learning_rate": 4.938863303008496e-05, + "loss": 3.2703, + "step": 118000 + }, + { + "epoch": 0.03683745047538963, + "grad_norm": 7.051368713378906, + "learning_rate": 4.938604249207684e-05, + "loss": 3.2665, + "step": 118500 + }, + { + "epoch": 0.036992882755876505, + "grad_norm": 8.530072212219238, + "learning_rate": 4.9383451954068725e-05, + "loss": 3.2636, + "step": 119000 + }, + { + "epoch": 0.03714831503636338, + "grad_norm": 6.964772701263428, + "learning_rate": 4.938086141606061e-05, + "loss": 3.2876, + "step": 119500 + }, + { + "epoch": 0.03730374731685026, + "grad_norm": 4.280052661895752, + "learning_rate": 4.93782708780525e-05, + "loss": 3.3086, + "step": 120000 + }, + { + "epoch": 0.03745917959733713, + "grad_norm": 13.605480194091797, + "learning_rate": 4.9375680340044386e-05, + "loss": 3.3156, + "step": 120500 + }, + { + "epoch": 0.03761461187782401, + "grad_norm": 10.886771202087402, + "learning_rate": 4.937308980203627e-05, + "loss": 3.2804, + "step": 121000 + }, + { + "epoch": 0.03777004415831089, + "grad_norm": 8.688804626464844, + "learning_rate": 4.9370499264028154e-05, + "loss": 3.3054, + "step": 121500 + }, + { + "epoch": 0.03792547643879776, + "grad_norm": 6.350268363952637, + "learning_rate": 4.936790872602004e-05, + "loss": 3.2163, + "step": 122000 + }, + { + "epoch": 0.038080908719284635, + "grad_norm": 6.3469014167785645, + "learning_rate": 4.936531818801193e-05, + "loss": 3.2884, + "step": 122500 + }, + { + "epoch": 0.038236340999771516, + "grad_norm": 5.850036144256592, + "learning_rate": 4.936272765000381e-05, + "loss": 3.285, + "step": 123000 + }, + { + "epoch": 0.03839177328025839, + "grad_norm": 6.235698223114014, + "learning_rate": 4.9360137111995696e-05, + "loss": 3.2871, + "step": 123500 + }, + { + "epoch": 0.038547205560745264, + "grad_norm": 7.8410964012146, + "learning_rate": 4.9357546573987576e-05, + "loss": 3.2656, + "step": 124000 + }, + { + "epoch": 0.038702637841232145, + "grad_norm": 8.010100364685059, + "learning_rate": 4.935495603597946e-05, + "loss": 3.2638, + "step": 124500 + }, + { + "epoch": 0.03885807012171902, + "grad_norm": 6.821716785430908, + "learning_rate": 4.935236549797135e-05, + "loss": 3.2604, + "step": 125000 + }, + { + "epoch": 0.03901350240220589, + "grad_norm": 7.1146087646484375, + "learning_rate": 4.934977495996324e-05, + "loss": 3.2386, + "step": 125500 + }, + { + "epoch": 0.03916893468269277, + "grad_norm": 8.275018692016602, + "learning_rate": 4.9347184421955125e-05, + "loss": 3.2706, + "step": 126000 + }, + { + "epoch": 0.03932436696317965, + "grad_norm": 7.271422386169434, + "learning_rate": 4.934459388394701e-05, + "loss": 3.2768, + "step": 126500 + }, + { + "epoch": 0.03947979924366652, + "grad_norm": 8.554838180541992, + "learning_rate": 4.934200334593889e-05, + "loss": 3.2435, + "step": 127000 + }, + { + "epoch": 0.0396352315241534, + "grad_norm": 9.456535339355469, + "learning_rate": 4.933941280793078e-05, + "loss": 3.2813, + "step": 127500 + }, + { + "epoch": 0.039790663804640275, + "grad_norm": 4.948188304901123, + "learning_rate": 4.9336822269922666e-05, + "loss": 3.2451, + "step": 128000 + }, + { + "epoch": 0.03994609608512715, + "grad_norm": 9.876341819763184, + "learning_rate": 4.933423173191455e-05, + "loss": 3.2685, + "step": 128500 + }, + { + "epoch": 0.04010152836561403, + "grad_norm": 8.443554878234863, + "learning_rate": 4.9331641193906434e-05, + "loss": 3.2284, + "step": 129000 + }, + { + "epoch": 0.0402569606461009, + "grad_norm": 6.402748107910156, + "learning_rate": 4.932905065589832e-05, + "loss": 3.2762, + "step": 129500 + }, + { + "epoch": 0.04041239292658778, + "grad_norm": 5.875990867614746, + "learning_rate": 4.932646011789021e-05, + "loss": 3.2826, + "step": 130000 + }, + { + "epoch": 0.04056782520707466, + "grad_norm": 6.314565658569336, + "learning_rate": 4.9323869579882095e-05, + "loss": 3.2748, + "step": 130500 + }, + { + "epoch": 0.04072325748756153, + "grad_norm": 6.96849250793457, + "learning_rate": 4.9321279041873976e-05, + "loss": 3.242, + "step": 131000 + }, + { + "epoch": 0.040878689768048405, + "grad_norm": 11.430787086486816, + "learning_rate": 4.931868850386586e-05, + "loss": 3.2725, + "step": 131500 + }, + { + "epoch": 0.041034122048535286, + "grad_norm": 6.7984209060668945, + "learning_rate": 4.931609796585775e-05, + "loss": 3.2545, + "step": 132000 + }, + { + "epoch": 0.04118955432902216, + "grad_norm": 7.563739776611328, + "learning_rate": 4.931350742784963e-05, + "loss": 3.2717, + "step": 132500 + }, + { + "epoch": 0.041344986609509034, + "grad_norm": 7.5692458152771, + "learning_rate": 4.931091688984152e-05, + "loss": 3.2477, + "step": 133000 + }, + { + "epoch": 0.041500418889995914, + "grad_norm": 7.434717655181885, + "learning_rate": 4.9308326351833405e-05, + "loss": 3.2695, + "step": 133500 + }, + { + "epoch": 0.04165585117048279, + "grad_norm": 9.108682632446289, + "learning_rate": 4.9305735813825285e-05, + "loss": 3.2535, + "step": 134000 + }, + { + "epoch": 0.04181128345096966, + "grad_norm": 8.31109619140625, + "learning_rate": 4.930314527581717e-05, + "loss": 3.2722, + "step": 134500 + }, + { + "epoch": 0.04196671573145654, + "grad_norm": 7.823109149932861, + "learning_rate": 4.930055473780906e-05, + "loss": 3.2652, + "step": 135000 + }, + { + "epoch": 0.04212214801194342, + "grad_norm": 8.341950416564941, + "learning_rate": 4.929796419980095e-05, + "loss": 3.2694, + "step": 135500 + }, + { + "epoch": 0.04227758029243029, + "grad_norm": 7.029598712921143, + "learning_rate": 4.9295373661792834e-05, + "loss": 3.221, + "step": 136000 + }, + { + "epoch": 0.04243301257291717, + "grad_norm": 4.522034645080566, + "learning_rate": 4.9292783123784714e-05, + "loss": 3.247, + "step": 136500 + }, + { + "epoch": 0.042588444853404045, + "grad_norm": 18.831592559814453, + "learning_rate": 4.92901925857766e-05, + "loss": 3.1957, + "step": 137000 + }, + { + "epoch": 0.04274387713389092, + "grad_norm": 9.79848575592041, + "learning_rate": 4.928760204776849e-05, + "loss": 3.2936, + "step": 137500 + }, + { + "epoch": 0.0428993094143778, + "grad_norm": 8.270401000976562, + "learning_rate": 4.928501150976037e-05, + "loss": 3.2454, + "step": 138000 + }, + { + "epoch": 0.04305474169486467, + "grad_norm": 9.428062438964844, + "learning_rate": 4.9282420971752256e-05, + "loss": 3.2389, + "step": 138500 + }, + { + "epoch": 0.04321017397535155, + "grad_norm": 8.271885871887207, + "learning_rate": 4.927983043374414e-05, + "loss": 3.2695, + "step": 139000 + }, + { + "epoch": 0.04336560625583843, + "grad_norm": 6.140800952911377, + "learning_rate": 4.927723989573603e-05, + "loss": 3.3655, + "step": 139500 + }, + { + "epoch": 0.0435210385363253, + "grad_norm": 8.188580513000488, + "learning_rate": 4.927464935772792e-05, + "loss": 3.2595, + "step": 140000 + }, + { + "epoch": 0.043676470816812175, + "grad_norm": 6.794388771057129, + "learning_rate": 4.9272058819719805e-05, + "loss": 3.2826, + "step": 140500 + }, + { + "epoch": 0.043831903097299056, + "grad_norm": 7.900724411010742, + "learning_rate": 4.9269468281711685e-05, + "loss": 3.2533, + "step": 141000 + }, + { + "epoch": 0.04398733537778593, + "grad_norm": 8.62564468383789, + "learning_rate": 4.926687774370357e-05, + "loss": 3.2312, + "step": 141500 + }, + { + "epoch": 0.044142767658272804, + "grad_norm": 12.566315650939941, + "learning_rate": 4.926428720569545e-05, + "loss": 3.2244, + "step": 142000 + }, + { + "epoch": 0.044298199938759684, + "grad_norm": 7.130504608154297, + "learning_rate": 4.926169666768734e-05, + "loss": 3.2714, + "step": 142500 + }, + { + "epoch": 0.04445363221924656, + "grad_norm": 9.469749450683594, + "learning_rate": 4.925910612967923e-05, + "loss": 3.222, + "step": 143000 + }, + { + "epoch": 0.04460906449973343, + "grad_norm": 8.149520874023438, + "learning_rate": 4.925651559167111e-05, + "loss": 3.2954, + "step": 143500 + }, + { + "epoch": 0.04476449678022031, + "grad_norm": 8.395315170288086, + "learning_rate": 4.9253925053662994e-05, + "loss": 3.296, + "step": 144000 + }, + { + "epoch": 0.04491992906070719, + "grad_norm": 11.059592247009277, + "learning_rate": 4.925133451565488e-05, + "loss": 3.254, + "step": 144500 + }, + { + "epoch": 0.04507536134119406, + "grad_norm": 8.070878982543945, + "learning_rate": 4.924874397764677e-05, + "loss": 3.2614, + "step": 145000 + }, + { + "epoch": 0.04523079362168094, + "grad_norm": 6.906238079071045, + "learning_rate": 4.9246153439638656e-05, + "loss": 3.2879, + "step": 145500 + }, + { + "epoch": 0.045386225902167815, + "grad_norm": 7.845452785491943, + "learning_rate": 4.924356290163054e-05, + "loss": 3.2192, + "step": 146000 + }, + { + "epoch": 0.04554165818265469, + "grad_norm": 8.124655723571777, + "learning_rate": 4.924097236362242e-05, + "loss": 3.2137, + "step": 146500 + }, + { + "epoch": 0.04569709046314157, + "grad_norm": 6.884174346923828, + "learning_rate": 4.923838182561431e-05, + "loss": 3.2387, + "step": 147000 + }, + { + "epoch": 0.04585252274362844, + "grad_norm": 8.385683059692383, + "learning_rate": 4.92357912876062e-05, + "loss": 3.2766, + "step": 147500 + }, + { + "epoch": 0.04600795502411532, + "grad_norm": 7.88958740234375, + "learning_rate": 4.923320074959808e-05, + "loss": 3.243, + "step": 148000 + }, + { + "epoch": 0.0461633873046022, + "grad_norm": 8.76440143585205, + "learning_rate": 4.9230610211589965e-05, + "loss": 3.2667, + "step": 148500 + }, + { + "epoch": 0.04631881958508907, + "grad_norm": 6.732757568359375, + "learning_rate": 4.9228019673581846e-05, + "loss": 3.2702, + "step": 149000 + }, + { + "epoch": 0.046474251865575945, + "grad_norm": 8.219280242919922, + "learning_rate": 4.922542913557374e-05, + "loss": 3.2621, + "step": 149500 + }, + { + "epoch": 0.04662968414606282, + "grad_norm": 7.277721881866455, + "learning_rate": 4.922283859756563e-05, + "loss": 3.2271, + "step": 150000 + }, + { + "epoch": 0.0467851164265497, + "grad_norm": 6.747356414794922, + "learning_rate": 4.922024805955751e-05, + "loss": 3.2924, + "step": 150500 + }, + { + "epoch": 0.046940548707036574, + "grad_norm": 9.166511535644531, + "learning_rate": 4.9217657521549394e-05, + "loss": 3.2189, + "step": 151000 + }, + { + "epoch": 0.04709598098752345, + "grad_norm": 15.40404987335205, + "learning_rate": 4.921506698354128e-05, + "loss": 3.2712, + "step": 151500 + }, + { + "epoch": 0.04725141326801033, + "grad_norm": 10.033618927001953, + "learning_rate": 4.921247644553316e-05, + "loss": 3.2703, + "step": 152000 + }, + { + "epoch": 0.0474068455484972, + "grad_norm": 7.278961658477783, + "learning_rate": 4.920988590752505e-05, + "loss": 3.2897, + "step": 152500 + }, + { + "epoch": 0.047562277828984076, + "grad_norm": 9.212068557739258, + "learning_rate": 4.9207295369516936e-05, + "loss": 3.249, + "step": 153000 + }, + { + "epoch": 0.04771771010947096, + "grad_norm": 8.490942001342773, + "learning_rate": 4.9204704831508816e-05, + "loss": 3.2498, + "step": 153500 + }, + { + "epoch": 0.04787314238995783, + "grad_norm": 9.180543899536133, + "learning_rate": 4.9202114293500704e-05, + "loss": 3.2077, + "step": 154000 + }, + { + "epoch": 0.048028574670444704, + "grad_norm": 43.037052154541016, + "learning_rate": 4.919952375549259e-05, + "loss": 3.1985, + "step": 154500 + }, + { + "epoch": 0.048184006950931585, + "grad_norm": 7.551147937774658, + "learning_rate": 4.919693321748448e-05, + "loss": 3.2114, + "step": 155000 + }, + { + "epoch": 0.04833943923141846, + "grad_norm": 10.053205490112305, + "learning_rate": 4.9194342679476365e-05, + "loss": 3.2318, + "step": 155500 + }, + { + "epoch": 0.04849487151190533, + "grad_norm": 8.22573471069336, + "learning_rate": 4.9191752141468245e-05, + "loss": 3.2229, + "step": 156000 + }, + { + "epoch": 0.04865030379239221, + "grad_norm": 8.297768592834473, + "learning_rate": 4.918916160346013e-05, + "loss": 3.3048, + "step": 156500 + }, + { + "epoch": 0.04880573607287909, + "grad_norm": 11.772842407226562, + "learning_rate": 4.918657106545202e-05, + "loss": 3.2496, + "step": 157000 + }, + { + "epoch": 0.04896116835336596, + "grad_norm": 6.4050798416137695, + "learning_rate": 4.91839805274439e-05, + "loss": 3.2441, + "step": 157500 + }, + { + "epoch": 0.04911660063385284, + "grad_norm": 9.75626277923584, + "learning_rate": 4.918138998943579e-05, + "loss": 3.2708, + "step": 158000 + }, + { + "epoch": 0.049272032914339715, + "grad_norm": 9.631768226623535, + "learning_rate": 4.9178799451427674e-05, + "loss": 3.2505, + "step": 158500 + }, + { + "epoch": 0.04942746519482659, + "grad_norm": 8.149422645568848, + "learning_rate": 4.9176208913419555e-05, + "loss": 3.2513, + "step": 159000 + }, + { + "epoch": 0.04958289747531347, + "grad_norm": 7.355200290679932, + "learning_rate": 4.917361837541145e-05, + "loss": 3.3156, + "step": 159500 + }, + { + "epoch": 0.049738329755800344, + "grad_norm": 8.070119857788086, + "learning_rate": 4.917102783740333e-05, + "loss": 3.2602, + "step": 160000 + }, + { + "epoch": 0.04989376203628722, + "grad_norm": 21.377656936645508, + "learning_rate": 4.9168437299395216e-05, + "loss": 3.2709, + "step": 160500 + }, + { + "epoch": 0.0500491943167741, + "grad_norm": 7.195154190063477, + "learning_rate": 4.91658467613871e-05, + "loss": 3.284, + "step": 161000 + }, + { + "epoch": 0.05020462659726097, + "grad_norm": 11.610124588012695, + "learning_rate": 4.9163256223378984e-05, + "loss": 3.2648, + "step": 161500 + }, + { + "epoch": 0.050360058877747846, + "grad_norm": 5.237759113311768, + "learning_rate": 4.916066568537087e-05, + "loss": 3.2858, + "step": 162000 + }, + { + "epoch": 0.05051549115823473, + "grad_norm": 7.234272480010986, + "learning_rate": 4.915807514736276e-05, + "loss": 3.2651, + "step": 162500 + }, + { + "epoch": 0.0506709234387216, + "grad_norm": 8.917335510253906, + "learning_rate": 4.915548460935464e-05, + "loss": 3.2525, + "step": 163000 + }, + { + "epoch": 0.050826355719208474, + "grad_norm": 37.20659637451172, + "learning_rate": 4.9152894071346526e-05, + "loss": 3.2713, + "step": 163500 + }, + { + "epoch": 0.050981787999695355, + "grad_norm": 11.743026733398438, + "learning_rate": 4.915030353333841e-05, + "loss": 3.3106, + "step": 164000 + }, + { + "epoch": 0.05113722028018223, + "grad_norm": 6.836338043212891, + "learning_rate": 4.91477129953303e-05, + "loss": 3.2223, + "step": 164500 + }, + { + "epoch": 0.0512926525606691, + "grad_norm": 7.108793258666992, + "learning_rate": 4.914512245732219e-05, + "loss": 3.2482, + "step": 165000 + }, + { + "epoch": 0.05144808484115598, + "grad_norm": 7.715301990509033, + "learning_rate": 4.9142531919314074e-05, + "loss": 3.3073, + "step": 165500 + }, + { + "epoch": 0.05160351712164286, + "grad_norm": 5.652644634246826, + "learning_rate": 4.9139941381305955e-05, + "loss": 3.2412, + "step": 166000 + }, + { + "epoch": 0.05175894940212973, + "grad_norm": 8.151137351989746, + "learning_rate": 4.913735084329784e-05, + "loss": 3.2407, + "step": 166500 + }, + { + "epoch": 0.05191438168261661, + "grad_norm": 8.031861305236816, + "learning_rate": 4.913476030528972e-05, + "loss": 3.2045, + "step": 167000 + }, + { + "epoch": 0.052069813963103485, + "grad_norm": 7.878591537475586, + "learning_rate": 4.913216976728161e-05, + "loss": 3.2439, + "step": 167500 + }, + { + "epoch": 0.05222524624359036, + "grad_norm": 5.9089131355285645, + "learning_rate": 4.9129579229273496e-05, + "loss": 3.2455, + "step": 168000 + }, + { + "epoch": 0.05238067852407724, + "grad_norm": 6.7528605461120605, + "learning_rate": 4.912698869126538e-05, + "loss": 3.2159, + "step": 168500 + }, + { + "epoch": 0.052536110804564114, + "grad_norm": 6.99171781539917, + "learning_rate": 4.9124398153257264e-05, + "loss": 3.2064, + "step": 169000 + }, + { + "epoch": 0.05269154308505099, + "grad_norm": 6.956116199493408, + "learning_rate": 4.912180761524916e-05, + "loss": 3.2149, + "step": 169500 + }, + { + "epoch": 0.05284697536553787, + "grad_norm": 7.512195110321045, + "learning_rate": 4.911921707724104e-05, + "loss": 3.2697, + "step": 170000 + }, + { + "epoch": 0.05300240764602474, + "grad_norm": 9.229897499084473, + "learning_rate": 4.9116626539232925e-05, + "loss": 3.2444, + "step": 170500 + }, + { + "epoch": 0.053157839926511616, + "grad_norm": 35.42271423339844, + "learning_rate": 4.911403600122481e-05, + "loss": 3.2238, + "step": 171000 + }, + { + "epoch": 0.0533132722069985, + "grad_norm": 25.83500099182129, + "learning_rate": 4.911144546321669e-05, + "loss": 3.206, + "step": 171500 + }, + { + "epoch": 0.05346870448748537, + "grad_norm": 12.679227828979492, + "learning_rate": 4.910885492520858e-05, + "loss": 3.1989, + "step": 172000 + }, + { + "epoch": 0.053624136767972244, + "grad_norm": 6.039393901824951, + "learning_rate": 4.910626438720046e-05, + "loss": 3.2345, + "step": 172500 + }, + { + "epoch": 0.053779569048459125, + "grad_norm": 9.594693183898926, + "learning_rate": 4.910367384919235e-05, + "loss": 3.1919, + "step": 173000 + }, + { + "epoch": 0.053935001328946, + "grad_norm": 7.429599285125732, + "learning_rate": 4.9101083311184235e-05, + "loss": 3.2215, + "step": 173500 + }, + { + "epoch": 0.05409043360943287, + "grad_norm": 11.02073860168457, + "learning_rate": 4.909849277317612e-05, + "loss": 3.2543, + "step": 174000 + }, + { + "epoch": 0.05424586588991975, + "grad_norm": 8.402997016906738, + "learning_rate": 4.909590223516801e-05, + "loss": 3.1957, + "step": 174500 + }, + { + "epoch": 0.05440129817040663, + "grad_norm": 6.675948619842529, + "learning_rate": 4.9093311697159896e-05, + "loss": 3.2324, + "step": 175000 + }, + { + "epoch": 0.0545567304508935, + "grad_norm": 6.487427711486816, + "learning_rate": 4.9090721159151777e-05, + "loss": 3.286, + "step": 175500 + }, + { + "epoch": 0.05471216273138038, + "grad_norm": 7.768229961395264, + "learning_rate": 4.9088130621143664e-05, + "loss": 3.2365, + "step": 176000 + }, + { + "epoch": 0.054867595011867255, + "grad_norm": 7.819295883178711, + "learning_rate": 4.908554008313555e-05, + "loss": 3.2649, + "step": 176500 + }, + { + "epoch": 0.05502302729235413, + "grad_norm": 7.512353420257568, + "learning_rate": 4.908294954512743e-05, + "loss": 3.2367, + "step": 177000 + }, + { + "epoch": 0.05517845957284101, + "grad_norm": 6.278263568878174, + "learning_rate": 4.908035900711932e-05, + "loss": 3.2485, + "step": 177500 + }, + { + "epoch": 0.055333891853327884, + "grad_norm": 5.294346809387207, + "learning_rate": 4.90777684691112e-05, + "loss": 3.2631, + "step": 178000 + }, + { + "epoch": 0.05548932413381476, + "grad_norm": 8.160414695739746, + "learning_rate": 4.9075177931103086e-05, + "loss": 3.2684, + "step": 178500 + }, + { + "epoch": 0.05564475641430164, + "grad_norm": 6.310203552246094, + "learning_rate": 4.907258739309497e-05, + "loss": 3.2501, + "step": 179000 + }, + { + "epoch": 0.05580018869478851, + "grad_norm": 9.393582344055176, + "learning_rate": 4.906999685508686e-05, + "loss": 3.1891, + "step": 179500 + }, + { + "epoch": 0.055955620975275386, + "grad_norm": 7.76079797744751, + "learning_rate": 4.906740631707875e-05, + "loss": 3.2674, + "step": 180000 + }, + { + "epoch": 0.05611105325576227, + "grad_norm": 7.362579345703125, + "learning_rate": 4.9064815779070635e-05, + "loss": 3.2235, + "step": 180500 + }, + { + "epoch": 0.05626648553624914, + "grad_norm": 8.193130493164062, + "learning_rate": 4.9062225241062515e-05, + "loss": 3.2715, + "step": 181000 + }, + { + "epoch": 0.056421917816736014, + "grad_norm": 8.241288185119629, + "learning_rate": 4.90596347030544e-05, + "loss": 3.2064, + "step": 181500 + }, + { + "epoch": 0.056577350097222895, + "grad_norm": 8.529385566711426, + "learning_rate": 4.905704416504629e-05, + "loss": 3.2845, + "step": 182000 + }, + { + "epoch": 0.05673278237770977, + "grad_norm": 5.807302951812744, + "learning_rate": 4.905445362703817e-05, + "loss": 3.2213, + "step": 182500 + }, + { + "epoch": 0.05688821465819664, + "grad_norm": 16.199304580688477, + "learning_rate": 4.905186308903006e-05, + "loss": 3.2586, + "step": 183000 + }, + { + "epoch": 0.057043646938683516, + "grad_norm": 7.160610198974609, + "learning_rate": 4.9049272551021944e-05, + "loss": 3.2199, + "step": 183500 + }, + { + "epoch": 0.0571990792191704, + "grad_norm": 5.65518856048584, + "learning_rate": 4.904668201301383e-05, + "loss": 3.2383, + "step": 184000 + }, + { + "epoch": 0.05735451149965727, + "grad_norm": 12.30341911315918, + "learning_rate": 4.904409147500572e-05, + "loss": 3.2444, + "step": 184500 + }, + { + "epoch": 0.057509943780144145, + "grad_norm": 6.117470741271973, + "learning_rate": 4.90415009369976e-05, + "loss": 3.2239, + "step": 185000 + }, + { + "epoch": 0.057665376060631025, + "grad_norm": 5.765038967132568, + "learning_rate": 4.9038910398989486e-05, + "loss": 3.2299, + "step": 185500 + }, + { + "epoch": 0.0578208083411179, + "grad_norm": 9.672142028808594, + "learning_rate": 4.903631986098137e-05, + "loss": 3.243, + "step": 186000 + }, + { + "epoch": 0.05797624062160477, + "grad_norm": 7.054978370666504, + "learning_rate": 4.903372932297325e-05, + "loss": 3.2327, + "step": 186500 + }, + { + "epoch": 0.058131672902091654, + "grad_norm": 7.177043914794922, + "learning_rate": 4.903113878496514e-05, + "loss": 3.2302, + "step": 187000 + }, + { + "epoch": 0.05828710518257853, + "grad_norm": 8.694104194641113, + "learning_rate": 4.902854824695703e-05, + "loss": 3.2413, + "step": 187500 + }, + { + "epoch": 0.0584425374630654, + "grad_norm": 7.815042018890381, + "learning_rate": 4.902595770894891e-05, + "loss": 3.2182, + "step": 188000 + }, + { + "epoch": 0.05859796974355228, + "grad_norm": 7.331467628479004, + "learning_rate": 4.9023367170940795e-05, + "loss": 3.2214, + "step": 188500 + }, + { + "epoch": 0.058753402024039156, + "grad_norm": 7.23491907119751, + "learning_rate": 4.902077663293268e-05, + "loss": 3.2407, + "step": 189000 + }, + { + "epoch": 0.05890883430452603, + "grad_norm": 6.387929439544678, + "learning_rate": 4.901818609492457e-05, + "loss": 3.2192, + "step": 189500 + }, + { + "epoch": 0.05906426658501291, + "grad_norm": 7.360497951507568, + "learning_rate": 4.9015595556916457e-05, + "loss": 3.1831, + "step": 190000 + }, + { + "epoch": 0.059219698865499784, + "grad_norm": 8.599339485168457, + "learning_rate": 4.901300501890834e-05, + "loss": 3.208, + "step": 190500 + }, + { + "epoch": 0.05937513114598666, + "grad_norm": 17.31464195251465, + "learning_rate": 4.9010414480900224e-05, + "loss": 3.2495, + "step": 191000 + }, + { + "epoch": 0.05953056342647354, + "grad_norm": 6.988974094390869, + "learning_rate": 4.900782394289211e-05, + "loss": 3.2134, + "step": 191500 + }, + { + "epoch": 0.05968599570696041, + "grad_norm": 6.462131023406982, + "learning_rate": 4.900523340488399e-05, + "loss": 3.2236, + "step": 192000 + }, + { + "epoch": 0.059841427987447286, + "grad_norm": 12.840971946716309, + "learning_rate": 4.900264286687588e-05, + "loss": 3.2055, + "step": 192500 + }, + { + "epoch": 0.05999686026793417, + "grad_norm": 6.309732437133789, + "learning_rate": 4.9000052328867766e-05, + "loss": 3.2617, + "step": 193000 + }, + { + "epoch": 0.06015229254842104, + "grad_norm": 7.95692777633667, + "learning_rate": 4.899746179085965e-05, + "loss": 3.2453, + "step": 193500 + }, + { + "epoch": 0.060307724828907915, + "grad_norm": 9.405898094177246, + "learning_rate": 4.899487125285154e-05, + "loss": 3.1951, + "step": 194000 + }, + { + "epoch": 0.060463157109394795, + "grad_norm": 7.388848304748535, + "learning_rate": 4.899228071484343e-05, + "loss": 3.1811, + "step": 194500 + }, + { + "epoch": 0.06061858938988167, + "grad_norm": 6.787569522857666, + "learning_rate": 4.898969017683531e-05, + "loss": 3.2525, + "step": 195000 + }, + { + "epoch": 0.06077402167036854, + "grad_norm": 7.843801975250244, + "learning_rate": 4.8987099638827195e-05, + "loss": 3.2015, + "step": 195500 + }, + { + "epoch": 0.060929453950855424, + "grad_norm": 8.681680679321289, + "learning_rate": 4.8984509100819075e-05, + "loss": 3.1701, + "step": 196000 + }, + { + "epoch": 0.0610848862313423, + "grad_norm": 8.731649398803711, + "learning_rate": 4.898191856281096e-05, + "loss": 3.2214, + "step": 196500 + }, + { + "epoch": 0.06124031851182917, + "grad_norm": 11.529400825500488, + "learning_rate": 4.897932802480285e-05, + "loss": 3.216, + "step": 197000 + }, + { + "epoch": 0.06139575079231605, + "grad_norm": 7.0222649574279785, + "learning_rate": 4.897673748679473e-05, + "loss": 3.2115, + "step": 197500 + }, + { + "epoch": 0.061551183072802926, + "grad_norm": 6.572511196136475, + "learning_rate": 4.897414694878662e-05, + "loss": 3.2086, + "step": 198000 + }, + { + "epoch": 0.0617066153532898, + "grad_norm": 9.180594444274902, + "learning_rate": 4.8971556410778504e-05, + "loss": 3.2151, + "step": 198500 + }, + { + "epoch": 0.06186204763377668, + "grad_norm": 8.414937973022461, + "learning_rate": 4.896896587277039e-05, + "loss": 3.2585, + "step": 199000 + }, + { + "epoch": 0.062017479914263554, + "grad_norm": 7.715460300445557, + "learning_rate": 4.896637533476228e-05, + "loss": 3.2205, + "step": 199500 + }, + { + "epoch": 0.06217291219475043, + "grad_norm": 12.017910957336426, + "learning_rate": 4.8963784796754166e-05, + "loss": 3.2773, + "step": 200000 + }, + { + "epoch": 0.06232834447523731, + "grad_norm": 14.353538513183594, + "learning_rate": 4.8961194258746046e-05, + "loss": 3.2084, + "step": 200500 + }, + { + "epoch": 0.06248377675572418, + "grad_norm": 6.6596221923828125, + "learning_rate": 4.895860372073793e-05, + "loss": 3.2117, + "step": 201000 + }, + { + "epoch": 0.06263920903621106, + "grad_norm": 6.184107780456543, + "learning_rate": 4.895601318272982e-05, + "loss": 3.2299, + "step": 201500 + }, + { + "epoch": 0.06279464131669793, + "grad_norm": 6.977295398712158, + "learning_rate": 4.89534226447217e-05, + "loss": 3.2323, + "step": 202000 + }, + { + "epoch": 0.06295007359718481, + "grad_norm": 11.614173889160156, + "learning_rate": 4.895083210671359e-05, + "loss": 3.2216, + "step": 202500 + }, + { + "epoch": 0.06310550587767169, + "grad_norm": 5.946804523468018, + "learning_rate": 4.8948241568705475e-05, + "loss": 3.2097, + "step": 203000 + }, + { + "epoch": 0.06326093815815856, + "grad_norm": 7.623793125152588, + "learning_rate": 4.894565103069736e-05, + "loss": 3.2501, + "step": 203500 + }, + { + "epoch": 0.06341637043864544, + "grad_norm": 7.162341117858887, + "learning_rate": 4.894306049268925e-05, + "loss": 3.2211, + "step": 204000 + }, + { + "epoch": 0.06357180271913232, + "grad_norm": 6.36505126953125, + "learning_rate": 4.894046995468113e-05, + "loss": 3.2721, + "step": 204500 + }, + { + "epoch": 0.06372723499961919, + "grad_norm": 7.10728120803833, + "learning_rate": 4.893787941667302e-05, + "loss": 3.2191, + "step": 205000 + }, + { + "epoch": 0.06388266728010607, + "grad_norm": 6.6028337478637695, + "learning_rate": 4.8935288878664904e-05, + "loss": 3.2282, + "step": 205500 + }, + { + "epoch": 0.06403809956059295, + "grad_norm": 6.434904098510742, + "learning_rate": 4.8932698340656785e-05, + "loss": 3.2746, + "step": 206000 + }, + { + "epoch": 0.06419353184107982, + "grad_norm": 9.137332916259766, + "learning_rate": 4.893010780264867e-05, + "loss": 3.2219, + "step": 206500 + }, + { + "epoch": 0.0643489641215667, + "grad_norm": 9.64322280883789, + "learning_rate": 4.892751726464056e-05, + "loss": 3.2215, + "step": 207000 + }, + { + "epoch": 0.06450439640205358, + "grad_norm": 6.212767124176025, + "learning_rate": 4.892492672663244e-05, + "loss": 3.2198, + "step": 207500 + }, + { + "epoch": 0.06465982868254044, + "grad_norm": 7.7452287673950195, + "learning_rate": 4.8922336188624326e-05, + "loss": 3.1835, + "step": 208000 + }, + { + "epoch": 0.06481526096302732, + "grad_norm": 4.948238372802734, + "learning_rate": 4.8919745650616214e-05, + "loss": 3.2068, + "step": 208500 + }, + { + "epoch": 0.0649706932435142, + "grad_norm": 7.793570041656494, + "learning_rate": 4.89171551126081e-05, + "loss": 3.2354, + "step": 209000 + }, + { + "epoch": 0.06512612552400107, + "grad_norm": 9.318753242492676, + "learning_rate": 4.891456457459999e-05, + "loss": 3.2519, + "step": 209500 + }, + { + "epoch": 0.06528155780448795, + "grad_norm": 7.704009532928467, + "learning_rate": 4.891197403659187e-05, + "loss": 3.2075, + "step": 210000 + }, + { + "epoch": 0.06543699008497483, + "grad_norm": 6.476907253265381, + "learning_rate": 4.8909383498583755e-05, + "loss": 3.2608, + "step": 210500 + }, + { + "epoch": 0.0655924223654617, + "grad_norm": 8.052131652832031, + "learning_rate": 4.890679296057564e-05, + "loss": 3.2421, + "step": 211000 + }, + { + "epoch": 0.06574785464594858, + "grad_norm": 8.085298538208008, + "learning_rate": 4.890420242256752e-05, + "loss": 3.236, + "step": 211500 + }, + { + "epoch": 0.06590328692643546, + "grad_norm": 12.321160316467285, + "learning_rate": 4.890161188455941e-05, + "loss": 3.2035, + "step": 212000 + }, + { + "epoch": 0.06605871920692233, + "grad_norm": 9.641210556030273, + "learning_rate": 4.88990213465513e-05, + "loss": 3.1396, + "step": 212500 + }, + { + "epoch": 0.06621415148740921, + "grad_norm": 14.246781349182129, + "learning_rate": 4.8896430808543184e-05, + "loss": 3.2097, + "step": 213000 + }, + { + "epoch": 0.06636958376789609, + "grad_norm": 7.92242431640625, + "learning_rate": 4.889384027053507e-05, + "loss": 3.1949, + "step": 213500 + }, + { + "epoch": 0.06652501604838296, + "grad_norm": 9.400290489196777, + "learning_rate": 4.889124973252695e-05, + "loss": 3.2448, + "step": 214000 + }, + { + "epoch": 0.06668044832886984, + "grad_norm": 7.383910179138184, + "learning_rate": 4.888865919451884e-05, + "loss": 3.1919, + "step": 214500 + }, + { + "epoch": 0.06683588060935672, + "grad_norm": 8.542524337768555, + "learning_rate": 4.8886068656510726e-05, + "loss": 3.2215, + "step": 215000 + }, + { + "epoch": 0.06699131288984359, + "grad_norm": 7.764432430267334, + "learning_rate": 4.8883478118502607e-05, + "loss": 3.2439, + "step": 215500 + }, + { + "epoch": 0.06714674517033047, + "grad_norm": 9.842550277709961, + "learning_rate": 4.8880887580494494e-05, + "loss": 3.2234, + "step": 216000 + }, + { + "epoch": 0.06730217745081735, + "grad_norm": 10.360559463500977, + "learning_rate": 4.887829704248638e-05, + "loss": 3.2264, + "step": 216500 + }, + { + "epoch": 0.06745760973130421, + "grad_norm": 7.008344650268555, + "learning_rate": 4.887570650447826e-05, + "loss": 3.2065, + "step": 217000 + }, + { + "epoch": 0.0676130420117911, + "grad_norm": 6.914950370788574, + "learning_rate": 4.887311596647015e-05, + "loss": 3.259, + "step": 217500 + }, + { + "epoch": 0.06776847429227797, + "grad_norm": 6.836582660675049, + "learning_rate": 4.8870525428462036e-05, + "loss": 3.2522, + "step": 218000 + }, + { + "epoch": 0.06792390657276484, + "grad_norm": 11.225698471069336, + "learning_rate": 4.886793489045392e-05, + "loss": 3.1984, + "step": 218500 + }, + { + "epoch": 0.06807933885325172, + "grad_norm": 7.416784286499023, + "learning_rate": 4.886534435244581e-05, + "loss": 3.2139, + "step": 219000 + }, + { + "epoch": 0.0682347711337386, + "grad_norm": 6.64291524887085, + "learning_rate": 4.88627538144377e-05, + "loss": 3.2114, + "step": 219500 + }, + { + "epoch": 0.06839020341422547, + "grad_norm": 7.293720245361328, + "learning_rate": 4.886016327642958e-05, + "loss": 3.2354, + "step": 220000 + }, + { + "epoch": 0.06854563569471235, + "grad_norm": 8.037970542907715, + "learning_rate": 4.8857572738421465e-05, + "loss": 3.2078, + "step": 220500 + }, + { + "epoch": 0.06870106797519923, + "grad_norm": 7.397656440734863, + "learning_rate": 4.8854982200413345e-05, + "loss": 3.1981, + "step": 221000 + }, + { + "epoch": 0.0688565002556861, + "grad_norm": 7.2382307052612305, + "learning_rate": 4.885239166240523e-05, + "loss": 3.2054, + "step": 221500 + }, + { + "epoch": 0.06901193253617298, + "grad_norm": 4.801016807556152, + "learning_rate": 4.884980112439712e-05, + "loss": 3.2363, + "step": 222000 + }, + { + "epoch": 0.06916736481665986, + "grad_norm": 7.711859703063965, + "learning_rate": 4.8847210586389e-05, + "loss": 3.2197, + "step": 222500 + }, + { + "epoch": 0.06932279709714673, + "grad_norm": 7.242246627807617, + "learning_rate": 4.8844620048380894e-05, + "loss": 3.2553, + "step": 223000 + }, + { + "epoch": 0.06947822937763361, + "grad_norm": 7.284317970275879, + "learning_rate": 4.884202951037278e-05, + "loss": 3.1905, + "step": 223500 + }, + { + "epoch": 0.06963366165812049, + "grad_norm": 6.082415580749512, + "learning_rate": 4.883943897236466e-05, + "loss": 3.1779, + "step": 224000 + }, + { + "epoch": 0.06978909393860735, + "grad_norm": 6.855501174926758, + "learning_rate": 4.883684843435655e-05, + "loss": 3.1945, + "step": 224500 + }, + { + "epoch": 0.06994452621909424, + "grad_norm": 7.898929119110107, + "learning_rate": 4.8834257896348435e-05, + "loss": 3.2857, + "step": 225000 + }, + { + "epoch": 0.07009995849958112, + "grad_norm": 12.047196388244629, + "learning_rate": 4.8831667358340316e-05, + "loss": 3.1983, + "step": 225500 + }, + { + "epoch": 0.07025539078006798, + "grad_norm": 17.747129440307617, + "learning_rate": 4.88290768203322e-05, + "loss": 3.2048, + "step": 226000 + }, + { + "epoch": 0.07041082306055486, + "grad_norm": 6.5752434730529785, + "learning_rate": 4.882648628232408e-05, + "loss": 3.2078, + "step": 226500 + }, + { + "epoch": 0.07056625534104174, + "grad_norm": 13.643658638000488, + "learning_rate": 4.882389574431597e-05, + "loss": 3.2326, + "step": 227000 + }, + { + "epoch": 0.07072168762152861, + "grad_norm": 8.102971076965332, + "learning_rate": 4.882130520630786e-05, + "loss": 3.2111, + "step": 227500 + }, + { + "epoch": 0.07087711990201549, + "grad_norm": 6.61681604385376, + "learning_rate": 4.8818714668299745e-05, + "loss": 3.2115, + "step": 228000 + }, + { + "epoch": 0.07103255218250237, + "grad_norm": 9.59108829498291, + "learning_rate": 4.881612413029163e-05, + "loss": 3.2366, + "step": 228500 + }, + { + "epoch": 0.07118798446298924, + "grad_norm": 4.4234747886657715, + "learning_rate": 4.881353359228352e-05, + "loss": 3.1837, + "step": 229000 + }, + { + "epoch": 0.07134341674347612, + "grad_norm": 19.284870147705078, + "learning_rate": 4.88109430542754e-05, + "loss": 3.2167, + "step": 229500 + }, + { + "epoch": 0.071498849023963, + "grad_norm": 7.419887065887451, + "learning_rate": 4.8808352516267287e-05, + "loss": 3.2182, + "step": 230000 + }, + { + "epoch": 0.07165428130444987, + "grad_norm": 16.27370834350586, + "learning_rate": 4.8805761978259174e-05, + "loss": 3.2263, + "step": 230500 + }, + { + "epoch": 0.07180971358493675, + "grad_norm": 9.477789878845215, + "learning_rate": 4.8803171440251054e-05, + "loss": 3.211, + "step": 231000 + }, + { + "epoch": 0.07196514586542363, + "grad_norm": 6.208843231201172, + "learning_rate": 4.880058090224294e-05, + "loss": 3.2387, + "step": 231500 + }, + { + "epoch": 0.0721205781459105, + "grad_norm": 6.0574727058410645, + "learning_rate": 4.879799036423482e-05, + "loss": 3.1859, + "step": 232000 + }, + { + "epoch": 0.07227601042639738, + "grad_norm": 8.017576217651367, + "learning_rate": 4.879539982622671e-05, + "loss": 3.2286, + "step": 232500 + }, + { + "epoch": 0.07243144270688424, + "grad_norm": 15.00483226776123, + "learning_rate": 4.87928092882186e-05, + "loss": 3.2158, + "step": 233000 + }, + { + "epoch": 0.07258687498737112, + "grad_norm": 12.09419059753418, + "learning_rate": 4.879021875021048e-05, + "loss": 3.2118, + "step": 233500 + }, + { + "epoch": 0.072742307267858, + "grad_norm": 6.928121566772461, + "learning_rate": 4.878762821220237e-05, + "loss": 3.206, + "step": 234000 + }, + { + "epoch": 0.07289773954834487, + "grad_norm": 6.442709922790527, + "learning_rate": 4.878503767419426e-05, + "loss": 3.2215, + "step": 234500 + }, + { + "epoch": 0.07305317182883175, + "grad_norm": 11.090587615966797, + "learning_rate": 4.878244713618614e-05, + "loss": 3.2346, + "step": 235000 + }, + { + "epoch": 0.07320860410931863, + "grad_norm": 7.514273166656494, + "learning_rate": 4.8779856598178025e-05, + "loss": 3.1802, + "step": 235500 + }, + { + "epoch": 0.0733640363898055, + "grad_norm": 10.104327201843262, + "learning_rate": 4.877726606016991e-05, + "loss": 3.2097, + "step": 236000 + }, + { + "epoch": 0.07351946867029238, + "grad_norm": 12.392394065856934, + "learning_rate": 4.877467552216179e-05, + "loss": 3.2223, + "step": 236500 + }, + { + "epoch": 0.07367490095077926, + "grad_norm": 4.820774078369141, + "learning_rate": 4.877208498415368e-05, + "loss": 3.2487, + "step": 237000 + }, + { + "epoch": 0.07383033323126613, + "grad_norm": 6.830599784851074, + "learning_rate": 4.876949444614557e-05, + "loss": 3.2162, + "step": 237500 + }, + { + "epoch": 0.07398576551175301, + "grad_norm": 6.318206787109375, + "learning_rate": 4.8766903908137454e-05, + "loss": 3.2138, + "step": 238000 + }, + { + "epoch": 0.07414119779223989, + "grad_norm": 5.971734046936035, + "learning_rate": 4.876431337012934e-05, + "loss": 3.1578, + "step": 238500 + }, + { + "epoch": 0.07429663007272676, + "grad_norm": 7.2676286697387695, + "learning_rate": 4.876172283212122e-05, + "loss": 3.2481, + "step": 239000 + }, + { + "epoch": 0.07445206235321364, + "grad_norm": 9.364696502685547, + "learning_rate": 4.875913229411311e-05, + "loss": 3.2058, + "step": 239500 + }, + { + "epoch": 0.07460749463370052, + "grad_norm": 6.917110919952393, + "learning_rate": 4.8756541756104996e-05, + "loss": 3.2286, + "step": 240000 + }, + { + "epoch": 0.07476292691418739, + "grad_norm": 16.5022029876709, + "learning_rate": 4.8753951218096876e-05, + "loss": 3.1939, + "step": 240500 + }, + { + "epoch": 0.07491835919467427, + "grad_norm": 8.112979888916016, + "learning_rate": 4.875136068008876e-05, + "loss": 3.1896, + "step": 241000 + }, + { + "epoch": 0.07507379147516115, + "grad_norm": 6.664256572723389, + "learning_rate": 4.874877014208065e-05, + "loss": 3.196, + "step": 241500 + }, + { + "epoch": 0.07522922375564801, + "grad_norm": 10.617268562316895, + "learning_rate": 4.874617960407253e-05, + "loss": 3.2346, + "step": 242000 + }, + { + "epoch": 0.0753846560361349, + "grad_norm": 6.6674394607543945, + "learning_rate": 4.874358906606442e-05, + "loss": 3.2433, + "step": 242500 + }, + { + "epoch": 0.07554008831662178, + "grad_norm": 15.126607894897461, + "learning_rate": 4.874099852805631e-05, + "loss": 3.1917, + "step": 243000 + }, + { + "epoch": 0.07569552059710864, + "grad_norm": 13.694937705993652, + "learning_rate": 4.873840799004819e-05, + "loss": 3.208, + "step": 243500 + }, + { + "epoch": 0.07585095287759552, + "grad_norm": 6.781734466552734, + "learning_rate": 4.873581745204008e-05, + "loss": 3.238, + "step": 244000 + }, + { + "epoch": 0.0760063851580824, + "grad_norm": 7.6213250160217285, + "learning_rate": 4.873322691403196e-05, + "loss": 3.2156, + "step": 244500 + }, + { + "epoch": 0.07616181743856927, + "grad_norm": 7.336849689483643, + "learning_rate": 4.873063637602385e-05, + "loss": 3.2376, + "step": 245000 + }, + { + "epoch": 0.07631724971905615, + "grad_norm": 7.281228065490723, + "learning_rate": 4.8728045838015734e-05, + "loss": 3.1876, + "step": 245500 + }, + { + "epoch": 0.07647268199954303, + "grad_norm": 8.91719913482666, + "learning_rate": 4.8725455300007614e-05, + "loss": 3.2133, + "step": 246000 + }, + { + "epoch": 0.0766281142800299, + "grad_norm": 5.959069728851318, + "learning_rate": 4.87228647619995e-05, + "loss": 3.2354, + "step": 246500 + }, + { + "epoch": 0.07678354656051678, + "grad_norm": 10.434021949768066, + "learning_rate": 4.872027422399139e-05, + "loss": 3.2234, + "step": 247000 + }, + { + "epoch": 0.07693897884100366, + "grad_norm": 6.6049370765686035, + "learning_rate": 4.8717683685983276e-05, + "loss": 3.1943, + "step": 247500 + }, + { + "epoch": 0.07709441112149053, + "grad_norm": 7.049847602844238, + "learning_rate": 4.871509314797516e-05, + "loss": 3.1934, + "step": 248000 + }, + { + "epoch": 0.07724984340197741, + "grad_norm": 7.233822822570801, + "learning_rate": 4.871250260996705e-05, + "loss": 3.2372, + "step": 248500 + }, + { + "epoch": 0.07740527568246429, + "grad_norm": 6.111257076263428, + "learning_rate": 4.870991207195893e-05, + "loss": 3.1831, + "step": 249000 + }, + { + "epoch": 0.07756070796295116, + "grad_norm": 11.214299201965332, + "learning_rate": 4.870732153395082e-05, + "loss": 3.1965, + "step": 249500 + }, + { + "epoch": 0.07771614024343804, + "grad_norm": 7.178966045379639, + "learning_rate": 4.87047309959427e-05, + "loss": 3.2031, + "step": 250000 + }, + { + "epoch": 0.07787157252392492, + "grad_norm": 23.425161361694336, + "learning_rate": 4.8702140457934585e-05, + "loss": 3.2459, + "step": 250500 + }, + { + "epoch": 0.07802700480441178, + "grad_norm": 8.266075134277344, + "learning_rate": 4.869954991992647e-05, + "loss": 3.2125, + "step": 251000 + }, + { + "epoch": 0.07818243708489866, + "grad_norm": 5.159907341003418, + "learning_rate": 4.869695938191835e-05, + "loss": 3.1561, + "step": 251500 + }, + { + "epoch": 0.07833786936538555, + "grad_norm": 8.876725196838379, + "learning_rate": 4.869436884391024e-05, + "loss": 3.2209, + "step": 252000 + }, + { + "epoch": 0.07849330164587241, + "grad_norm": 6.805834770202637, + "learning_rate": 4.869177830590213e-05, + "loss": 3.1618, + "step": 252500 + }, + { + "epoch": 0.0786487339263593, + "grad_norm": 6.923367977142334, + "learning_rate": 4.8689187767894014e-05, + "loss": 3.2202, + "step": 253000 + }, + { + "epoch": 0.07880416620684617, + "grad_norm": 7.328836441040039, + "learning_rate": 4.86865972298859e-05, + "loss": 3.1961, + "step": 253500 + }, + { + "epoch": 0.07895959848733304, + "grad_norm": 6.718837738037109, + "learning_rate": 4.868400669187779e-05, + "loss": 3.2317, + "step": 254000 + }, + { + "epoch": 0.07911503076781992, + "grad_norm": 9.168951034545898, + "learning_rate": 4.868141615386967e-05, + "loss": 3.2022, + "step": 254500 + }, + { + "epoch": 0.0792704630483068, + "grad_norm": 7.528100967407227, + "learning_rate": 4.8678825615861556e-05, + "loss": 3.1991, + "step": 255000 + }, + { + "epoch": 0.07942589532879367, + "grad_norm": 6.064104080200195, + "learning_rate": 4.867623507785344e-05, + "loss": 3.1896, + "step": 255500 + }, + { + "epoch": 0.07958132760928055, + "grad_norm": 9.363104820251465, + "learning_rate": 4.8673644539845324e-05, + "loss": 3.2219, + "step": 256000 + }, + { + "epoch": 0.07973675988976743, + "grad_norm": 10.28849983215332, + "learning_rate": 4.867105400183721e-05, + "loss": 3.1796, + "step": 256500 + }, + { + "epoch": 0.0798921921702543, + "grad_norm": 8.1656494140625, + "learning_rate": 4.86684634638291e-05, + "loss": 3.2182, + "step": 257000 + }, + { + "epoch": 0.08004762445074118, + "grad_norm": 7.798013687133789, + "learning_rate": 4.8665872925820985e-05, + "loss": 3.2292, + "step": 257500 + }, + { + "epoch": 0.08020305673122806, + "grad_norm": 7.97374963760376, + "learning_rate": 4.866328238781287e-05, + "loss": 3.2142, + "step": 258000 + }, + { + "epoch": 0.08035848901171493, + "grad_norm": 6.899406909942627, + "learning_rate": 4.866069184980475e-05, + "loss": 3.2043, + "step": 258500 + }, + { + "epoch": 0.0805139212922018, + "grad_norm": 6.984812259674072, + "learning_rate": 4.865810131179664e-05, + "loss": 3.1847, + "step": 259000 + }, + { + "epoch": 0.08066935357268869, + "grad_norm": 6.737573146820068, + "learning_rate": 4.865551077378853e-05, + "loss": 3.1993, + "step": 259500 + }, + { + "epoch": 0.08082478585317555, + "grad_norm": 23.836368560791016, + "learning_rate": 4.865292023578041e-05, + "loss": 3.1976, + "step": 260000 + }, + { + "epoch": 0.08098021813366243, + "grad_norm": 9.042171478271484, + "learning_rate": 4.8650329697772295e-05, + "loss": 3.1986, + "step": 260500 + }, + { + "epoch": 0.08113565041414932, + "grad_norm": 6.311880111694336, + "learning_rate": 4.864773915976418e-05, + "loss": 3.1561, + "step": 261000 + }, + { + "epoch": 0.08129108269463618, + "grad_norm": 7.330997467041016, + "learning_rate": 4.864514862175606e-05, + "loss": 3.2065, + "step": 261500 + }, + { + "epoch": 0.08144651497512306, + "grad_norm": 13.30780029296875, + "learning_rate": 4.864255808374795e-05, + "loss": 3.2683, + "step": 262000 + }, + { + "epoch": 0.08160194725560994, + "grad_norm": 7.215512275695801, + "learning_rate": 4.8639967545739836e-05, + "loss": 3.2059, + "step": 262500 + }, + { + "epoch": 0.08175737953609681, + "grad_norm": 10.715143203735352, + "learning_rate": 4.8637377007731723e-05, + "loss": 3.2189, + "step": 263000 + }, + { + "epoch": 0.08191281181658369, + "grad_norm": 7.577348232269287, + "learning_rate": 4.863478646972361e-05, + "loss": 3.2159, + "step": 263500 + }, + { + "epoch": 0.08206824409707057, + "grad_norm": 6.198236465454102, + "learning_rate": 4.863219593171549e-05, + "loss": 3.2117, + "step": 264000 + }, + { + "epoch": 0.08222367637755744, + "grad_norm": 7.623655796051025, + "learning_rate": 4.862960539370738e-05, + "loss": 3.2505, + "step": 264500 + }, + { + "epoch": 0.08237910865804432, + "grad_norm": 6.779893398284912, + "learning_rate": 4.8627014855699265e-05, + "loss": 3.2235, + "step": 265000 + }, + { + "epoch": 0.0825345409385312, + "grad_norm": 7.1539201736450195, + "learning_rate": 4.8624424317691146e-05, + "loss": 3.1534, + "step": 265500 + }, + { + "epoch": 0.08268997321901807, + "grad_norm": 9.191695213317871, + "learning_rate": 4.862183377968303e-05, + "loss": 3.1818, + "step": 266000 + }, + { + "epoch": 0.08284540549950495, + "grad_norm": 7.667037487030029, + "learning_rate": 4.861924324167492e-05, + "loss": 3.1805, + "step": 266500 + }, + { + "epoch": 0.08300083777999183, + "grad_norm": 7.823062896728516, + "learning_rate": 4.861665270366681e-05, + "loss": 3.2188, + "step": 267000 + }, + { + "epoch": 0.0831562700604787, + "grad_norm": 5.846076488494873, + "learning_rate": 4.8614062165658694e-05, + "loss": 3.1411, + "step": 267500 + }, + { + "epoch": 0.08331170234096558, + "grad_norm": 4.7285308837890625, + "learning_rate": 4.861147162765058e-05, + "loss": 3.2305, + "step": 268000 + }, + { + "epoch": 0.08346713462145246, + "grad_norm": 8.750801086425781, + "learning_rate": 4.860888108964246e-05, + "loss": 3.1844, + "step": 268500 + }, + { + "epoch": 0.08362256690193932, + "grad_norm": 8.580436706542969, + "learning_rate": 4.860629055163435e-05, + "loss": 3.1971, + "step": 269000 + }, + { + "epoch": 0.0837779991824262, + "grad_norm": 7.626330375671387, + "learning_rate": 4.860370001362623e-05, + "loss": 3.2126, + "step": 269500 + }, + { + "epoch": 0.08393343146291309, + "grad_norm": 15.679354667663574, + "learning_rate": 4.8601109475618117e-05, + "loss": 3.2041, + "step": 270000 + }, + { + "epoch": 0.08408886374339995, + "grad_norm": 6.373791217803955, + "learning_rate": 4.8598518937610004e-05, + "loss": 3.2274, + "step": 270500 + }, + { + "epoch": 0.08424429602388683, + "grad_norm": 6.1450910568237305, + "learning_rate": 4.8595928399601884e-05, + "loss": 3.1993, + "step": 271000 + }, + { + "epoch": 0.08439972830437371, + "grad_norm": 6.499091625213623, + "learning_rate": 4.859333786159377e-05, + "loss": 3.1898, + "step": 271500 + }, + { + "epoch": 0.08455516058486058, + "grad_norm": 8.027620315551758, + "learning_rate": 4.859074732358566e-05, + "loss": 3.2406, + "step": 272000 + }, + { + "epoch": 0.08471059286534746, + "grad_norm": 8.333386421203613, + "learning_rate": 4.8588156785577546e-05, + "loss": 3.1809, + "step": 272500 + }, + { + "epoch": 0.08486602514583434, + "grad_norm": 8.851384162902832, + "learning_rate": 4.858556624756943e-05, + "loss": 3.1958, + "step": 273000 + }, + { + "epoch": 0.08502145742632121, + "grad_norm": 12.166316032409668, + "learning_rate": 4.858297570956132e-05, + "loss": 3.1922, + "step": 273500 + }, + { + "epoch": 0.08517688970680809, + "grad_norm": 17.51677894592285, + "learning_rate": 4.85803851715532e-05, + "loss": 3.1759, + "step": 274000 + }, + { + "epoch": 0.08533232198729497, + "grad_norm": 8.503355026245117, + "learning_rate": 4.857779463354509e-05, + "loss": 3.1815, + "step": 274500 + }, + { + "epoch": 0.08548775426778184, + "grad_norm": 6.137835502624512, + "learning_rate": 4.857520409553697e-05, + "loss": 3.1803, + "step": 275000 + }, + { + "epoch": 0.08564318654826872, + "grad_norm": 6.6335859298706055, + "learning_rate": 4.8572613557528855e-05, + "loss": 3.206, + "step": 275500 + }, + { + "epoch": 0.0857986188287556, + "grad_norm": 7.227497100830078, + "learning_rate": 4.857002301952074e-05, + "loss": 3.1833, + "step": 276000 + }, + { + "epoch": 0.08595405110924247, + "grad_norm": 9.480046272277832, + "learning_rate": 4.856743248151263e-05, + "loss": 3.227, + "step": 276500 + }, + { + "epoch": 0.08610948338972935, + "grad_norm": 9.173969268798828, + "learning_rate": 4.8564841943504516e-05, + "loss": 3.2231, + "step": 277000 + }, + { + "epoch": 0.08626491567021623, + "grad_norm": 9.73849868774414, + "learning_rate": 4.8562251405496404e-05, + "loss": 3.2024, + "step": 277500 + }, + { + "epoch": 0.0864203479507031, + "grad_norm": 8.863982200622559, + "learning_rate": 4.8559660867488284e-05, + "loss": 3.2136, + "step": 278000 + }, + { + "epoch": 0.08657578023118997, + "grad_norm": 11.830785751342773, + "learning_rate": 4.855707032948017e-05, + "loss": 3.1722, + "step": 278500 + }, + { + "epoch": 0.08673121251167686, + "grad_norm": 5.768320083618164, + "learning_rate": 4.855447979147206e-05, + "loss": 3.2395, + "step": 279000 + }, + { + "epoch": 0.08688664479216372, + "grad_norm": 6.222928524017334, + "learning_rate": 4.855188925346394e-05, + "loss": 3.1894, + "step": 279500 + }, + { + "epoch": 0.0870420770726506, + "grad_norm": 9.109928131103516, + "learning_rate": 4.8549298715455826e-05, + "loss": 3.2405, + "step": 280000 + }, + { + "epoch": 0.08719750935313748, + "grad_norm": 7.896403789520264, + "learning_rate": 4.8546708177447706e-05, + "loss": 3.1874, + "step": 280500 + }, + { + "epoch": 0.08735294163362435, + "grad_norm": 7.94150447845459, + "learning_rate": 4.854411763943959e-05, + "loss": 3.1895, + "step": 281000 + }, + { + "epoch": 0.08750837391411123, + "grad_norm": 6.938539028167725, + "learning_rate": 4.854152710143148e-05, + "loss": 3.1995, + "step": 281500 + }, + { + "epoch": 0.08766380619459811, + "grad_norm": 7.414036750793457, + "learning_rate": 4.853893656342337e-05, + "loss": 3.1814, + "step": 282000 + }, + { + "epoch": 0.08781923847508498, + "grad_norm": 10.229009628295898, + "learning_rate": 4.8536346025415255e-05, + "loss": 3.1822, + "step": 282500 + }, + { + "epoch": 0.08797467075557186, + "grad_norm": 7.596944808959961, + "learning_rate": 4.853375548740714e-05, + "loss": 3.2085, + "step": 283000 + }, + { + "epoch": 0.08813010303605874, + "grad_norm": 7.168403148651123, + "learning_rate": 4.853116494939902e-05, + "loss": 3.2052, + "step": 283500 + }, + { + "epoch": 0.08828553531654561, + "grad_norm": 22.484220504760742, + "learning_rate": 4.852857441139091e-05, + "loss": 3.2063, + "step": 284000 + }, + { + "epoch": 0.08844096759703249, + "grad_norm": 7.56538724899292, + "learning_rate": 4.8525983873382797e-05, + "loss": 3.2323, + "step": 284500 + }, + { + "epoch": 0.08859639987751937, + "grad_norm": 7.964419364929199, + "learning_rate": 4.852339333537468e-05, + "loss": 3.1623, + "step": 285000 + }, + { + "epoch": 0.08875183215800624, + "grad_norm": 10.760733604431152, + "learning_rate": 4.8520802797366564e-05, + "loss": 3.1809, + "step": 285500 + }, + { + "epoch": 0.08890726443849312, + "grad_norm": 8.079580307006836, + "learning_rate": 4.851821225935845e-05, + "loss": 3.1947, + "step": 286000 + }, + { + "epoch": 0.08906269671898, + "grad_norm": 14.81601619720459, + "learning_rate": 4.851562172135034e-05, + "loss": 3.1466, + "step": 286500 + }, + { + "epoch": 0.08921812899946686, + "grad_norm": 6.316641330718994, + "learning_rate": 4.8513031183342226e-05, + "loss": 3.1946, + "step": 287000 + }, + { + "epoch": 0.08937356127995374, + "grad_norm": 7.919250965118408, + "learning_rate": 4.8510440645334106e-05, + "loss": 3.1945, + "step": 287500 + }, + { + "epoch": 0.08952899356044063, + "grad_norm": 9.761019706726074, + "learning_rate": 4.850785010732599e-05, + "loss": 3.1571, + "step": 288000 + }, + { + "epoch": 0.08968442584092749, + "grad_norm": 7.806896686553955, + "learning_rate": 4.850525956931788e-05, + "loss": 3.223, + "step": 288500 + }, + { + "epoch": 0.08983985812141437, + "grad_norm": 7.051902770996094, + "learning_rate": 4.850266903130976e-05, + "loss": 3.2218, + "step": 289000 + }, + { + "epoch": 0.08999529040190125, + "grad_norm": 6.751678943634033, + "learning_rate": 4.850007849330165e-05, + "loss": 3.1684, + "step": 289500 + }, + { + "epoch": 0.09015072268238812, + "grad_norm": 7.63497257232666, + "learning_rate": 4.8497487955293535e-05, + "loss": 3.2542, + "step": 290000 + }, + { + "epoch": 0.090306154962875, + "grad_norm": 16.189546585083008, + "learning_rate": 4.8494897417285415e-05, + "loss": 3.2299, + "step": 290500 + }, + { + "epoch": 0.09046158724336188, + "grad_norm": 9.000436782836914, + "learning_rate": 4.84923068792773e-05, + "loss": 3.1966, + "step": 291000 + }, + { + "epoch": 0.09061701952384875, + "grad_norm": 7.031053066253662, + "learning_rate": 4.848971634126919e-05, + "loss": 3.2002, + "step": 291500 + }, + { + "epoch": 0.09077245180433563, + "grad_norm": 7.924676895141602, + "learning_rate": 4.848712580326108e-05, + "loss": 3.1924, + "step": 292000 + }, + { + "epoch": 0.09092788408482251, + "grad_norm": 8.287687301635742, + "learning_rate": 4.8484535265252964e-05, + "loss": 3.1687, + "step": 292500 + }, + { + "epoch": 0.09108331636530938, + "grad_norm": 8.479589462280273, + "learning_rate": 4.8481944727244844e-05, + "loss": 3.1523, + "step": 293000 + }, + { + "epoch": 0.09123874864579626, + "grad_norm": 7.596410751342773, + "learning_rate": 4.847935418923673e-05, + "loss": 3.1881, + "step": 293500 + }, + { + "epoch": 0.09139418092628314, + "grad_norm": 7.639554977416992, + "learning_rate": 4.847676365122862e-05, + "loss": 3.1766, + "step": 294000 + }, + { + "epoch": 0.09154961320677, + "grad_norm": 5.0725884437561035, + "learning_rate": 4.84741731132205e-05, + "loss": 3.1637, + "step": 294500 + }, + { + "epoch": 0.09170504548725689, + "grad_norm": 10.078112602233887, + "learning_rate": 4.8471582575212386e-05, + "loss": 3.1455, + "step": 295000 + }, + { + "epoch": 0.09186047776774377, + "grad_norm": 5.912460803985596, + "learning_rate": 4.846899203720427e-05, + "loss": 3.21, + "step": 295500 + }, + { + "epoch": 0.09201591004823063, + "grad_norm": 7.294102191925049, + "learning_rate": 4.8466401499196154e-05, + "loss": 3.1722, + "step": 296000 + }, + { + "epoch": 0.09217134232871751, + "grad_norm": 4.877582550048828, + "learning_rate": 4.846381096118805e-05, + "loss": 3.1872, + "step": 296500 + }, + { + "epoch": 0.0923267746092044, + "grad_norm": 6.889306545257568, + "learning_rate": 4.8461220423179935e-05, + "loss": 3.1839, + "step": 297000 + }, + { + "epoch": 0.09248220688969126, + "grad_norm": 7.839190483093262, + "learning_rate": 4.8458629885171815e-05, + "loss": 3.1723, + "step": 297500 + }, + { + "epoch": 0.09263763917017814, + "grad_norm": 18.454544067382812, + "learning_rate": 4.84560393471637e-05, + "loss": 3.1885, + "step": 298000 + }, + { + "epoch": 0.09279307145066502, + "grad_norm": 5.680841445922852, + "learning_rate": 4.845344880915558e-05, + "loss": 3.148, + "step": 298500 + }, + { + "epoch": 0.09294850373115189, + "grad_norm": 13.747638702392578, + "learning_rate": 4.845085827114747e-05, + "loss": 3.2082, + "step": 299000 + }, + { + "epoch": 0.09310393601163877, + "grad_norm": 5.722404479980469, + "learning_rate": 4.844826773313936e-05, + "loss": 3.1835, + "step": 299500 + }, + { + "epoch": 0.09325936829212564, + "grad_norm": 8.772673606872559, + "learning_rate": 4.844567719513124e-05, + "loss": 3.1537, + "step": 300000 + }, + { + "epoch": 0.09341480057261252, + "grad_norm": 6.989293575286865, + "learning_rate": 4.8443086657123124e-05, + "loss": 3.1952, + "step": 300500 + }, + { + "epoch": 0.0935702328530994, + "grad_norm": 8.088149070739746, + "learning_rate": 4.844049611911501e-05, + "loss": 3.1899, + "step": 301000 + }, + { + "epoch": 0.09372566513358627, + "grad_norm": 10.632081031799316, + "learning_rate": 4.84379055811069e-05, + "loss": 3.2196, + "step": 301500 + }, + { + "epoch": 0.09388109741407315, + "grad_norm": 9.196609497070312, + "learning_rate": 4.8435315043098786e-05, + "loss": 3.2292, + "step": 302000 + }, + { + "epoch": 0.09403652969456003, + "grad_norm": 7.169755935668945, + "learning_rate": 4.843272450509067e-05, + "loss": 3.1833, + "step": 302500 + }, + { + "epoch": 0.0941919619750469, + "grad_norm": 8.304545402526855, + "learning_rate": 4.8430133967082553e-05, + "loss": 3.1801, + "step": 303000 + }, + { + "epoch": 0.09434739425553378, + "grad_norm": 8.64903736114502, + "learning_rate": 4.842754342907444e-05, + "loss": 3.155, + "step": 303500 + }, + { + "epoch": 0.09450282653602066, + "grad_norm": 12.639022827148438, + "learning_rate": 4.842495289106633e-05, + "loss": 3.2107, + "step": 304000 + }, + { + "epoch": 0.09465825881650752, + "grad_norm": 8.557443618774414, + "learning_rate": 4.842236235305821e-05, + "loss": 3.1564, + "step": 304500 + }, + { + "epoch": 0.0948136910969944, + "grad_norm": 11.042366981506348, + "learning_rate": 4.8419771815050095e-05, + "loss": 3.1857, + "step": 305000 + }, + { + "epoch": 0.09496912337748128, + "grad_norm": 6.05155086517334, + "learning_rate": 4.8417181277041976e-05, + "loss": 3.2172, + "step": 305500 + }, + { + "epoch": 0.09512455565796815, + "grad_norm": 6.857419490814209, + "learning_rate": 4.841459073903386e-05, + "loss": 3.2009, + "step": 306000 + }, + { + "epoch": 0.09527998793845503, + "grad_norm": 7.850050926208496, + "learning_rate": 4.841200020102576e-05, + "loss": 3.182, + "step": 306500 + }, + { + "epoch": 0.09543542021894191, + "grad_norm": 8.689955711364746, + "learning_rate": 4.840940966301764e-05, + "loss": 3.1844, + "step": 307000 + }, + { + "epoch": 0.09559085249942878, + "grad_norm": 7.031333923339844, + "learning_rate": 4.8406819125009524e-05, + "loss": 3.1872, + "step": 307500 + }, + { + "epoch": 0.09574628477991566, + "grad_norm": 9.475797653198242, + "learning_rate": 4.840422858700141e-05, + "loss": 3.2278, + "step": 308000 + }, + { + "epoch": 0.09590171706040254, + "grad_norm": 6.244027137756348, + "learning_rate": 4.840163804899329e-05, + "loss": 3.1307, + "step": 308500 + }, + { + "epoch": 0.09605714934088941, + "grad_norm": 7.940578460693359, + "learning_rate": 4.839904751098518e-05, + "loss": 3.167, + "step": 309000 + }, + { + "epoch": 0.09621258162137629, + "grad_norm": 7.5359907150268555, + "learning_rate": 4.8396456972977066e-05, + "loss": 3.1697, + "step": 309500 + }, + { + "epoch": 0.09636801390186317, + "grad_norm": 7.270114421844482, + "learning_rate": 4.8393866434968947e-05, + "loss": 3.2143, + "step": 310000 + }, + { + "epoch": 0.09652344618235004, + "grad_norm": 6.572207927703857, + "learning_rate": 4.8391275896960834e-05, + "loss": 3.1378, + "step": 310500 + }, + { + "epoch": 0.09667887846283692, + "grad_norm": 6.419137001037598, + "learning_rate": 4.838868535895272e-05, + "loss": 3.156, + "step": 311000 + }, + { + "epoch": 0.0968343107433238, + "grad_norm": 7.803755283355713, + "learning_rate": 4.838609482094461e-05, + "loss": 3.2003, + "step": 311500 + }, + { + "epoch": 0.09698974302381067, + "grad_norm": 22.60590362548828, + "learning_rate": 4.8383504282936495e-05, + "loss": 3.1768, + "step": 312000 + }, + { + "epoch": 0.09714517530429755, + "grad_norm": 6.504997253417969, + "learning_rate": 4.8380913744928375e-05, + "loss": 3.1753, + "step": 312500 + }, + { + "epoch": 0.09730060758478443, + "grad_norm": 6.824978828430176, + "learning_rate": 4.837832320692026e-05, + "loss": 3.1801, + "step": 313000 + }, + { + "epoch": 0.0974560398652713, + "grad_norm": 6.524799346923828, + "learning_rate": 4.837573266891215e-05, + "loss": 3.1683, + "step": 313500 + }, + { + "epoch": 0.09761147214575817, + "grad_norm": 7.29182767868042, + "learning_rate": 4.837314213090403e-05, + "loss": 3.1938, + "step": 314000 + }, + { + "epoch": 0.09776690442624505, + "grad_norm": 11.790168762207031, + "learning_rate": 4.837055159289592e-05, + "loss": 3.1764, + "step": 314500 + }, + { + "epoch": 0.09792233670673192, + "grad_norm": 7.414951324462891, + "learning_rate": 4.8367961054887804e-05, + "loss": 3.1476, + "step": 315000 + }, + { + "epoch": 0.0980777689872188, + "grad_norm": 9.613347053527832, + "learning_rate": 4.8365370516879685e-05, + "loss": 3.2337, + "step": 315500 + }, + { + "epoch": 0.09823320126770568, + "grad_norm": 8.400300025939941, + "learning_rate": 4.836277997887157e-05, + "loss": 3.2017, + "step": 316000 + }, + { + "epoch": 0.09838863354819255, + "grad_norm": 7.980452060699463, + "learning_rate": 4.836018944086346e-05, + "loss": 3.1786, + "step": 316500 + }, + { + "epoch": 0.09854406582867943, + "grad_norm": 7.0912370681762695, + "learning_rate": 4.8357598902855346e-05, + "loss": 3.1908, + "step": 317000 + }, + { + "epoch": 0.09869949810916631, + "grad_norm": 5.718499660491943, + "learning_rate": 4.8355008364847233e-05, + "loss": 3.1868, + "step": 317500 + }, + { + "epoch": 0.09885493038965318, + "grad_norm": 7.598127841949463, + "learning_rate": 4.8352417826839114e-05, + "loss": 3.2213, + "step": 318000 + }, + { + "epoch": 0.09901036267014006, + "grad_norm": 6.534087181091309, + "learning_rate": 4.8349827288831e-05, + "loss": 3.161, + "step": 318500 + }, + { + "epoch": 0.09916579495062694, + "grad_norm": 6.779377460479736, + "learning_rate": 4.834723675082289e-05, + "loss": 3.2156, + "step": 319000 + }, + { + "epoch": 0.0993212272311138, + "grad_norm": 6.944066524505615, + "learning_rate": 4.834464621281477e-05, + "loss": 3.2115, + "step": 319500 + }, + { + "epoch": 0.09947665951160069, + "grad_norm": 9.246770858764648, + "learning_rate": 4.8342055674806656e-05, + "loss": 3.1834, + "step": 320000 + }, + { + "epoch": 0.09963209179208757, + "grad_norm": 6.912715911865234, + "learning_rate": 4.833946513679854e-05, + "loss": 3.1592, + "step": 320500 + }, + { + "epoch": 0.09978752407257444, + "grad_norm": 6.04251766204834, + "learning_rate": 4.833687459879043e-05, + "loss": 3.1811, + "step": 321000 + }, + { + "epoch": 0.09994295635306132, + "grad_norm": 10.505280494689941, + "learning_rate": 4.833428406078232e-05, + "loss": 3.172, + "step": 321500 + }, + { + "epoch": 0.1000983886335482, + "grad_norm": 6.277884006500244, + "learning_rate": 4.8331693522774204e-05, + "loss": 3.2007, + "step": 322000 + }, + { + "epoch": 0.10025382091403506, + "grad_norm": 7.4872894287109375, + "learning_rate": 4.8329102984766085e-05, + "loss": 3.1792, + "step": 322500 + }, + { + "epoch": 0.10040925319452194, + "grad_norm": 12.148858070373535, + "learning_rate": 4.832651244675797e-05, + "loss": 3.1283, + "step": 323000 + }, + { + "epoch": 0.10056468547500882, + "grad_norm": 7.833829402923584, + "learning_rate": 4.832392190874985e-05, + "loss": 3.1549, + "step": 323500 + }, + { + "epoch": 0.10072011775549569, + "grad_norm": 10.688488960266113, + "learning_rate": 4.832133137074174e-05, + "loss": 3.1383, + "step": 324000 + }, + { + "epoch": 0.10087555003598257, + "grad_norm": 7.905416011810303, + "learning_rate": 4.8318740832733627e-05, + "loss": 3.1792, + "step": 324500 + }, + { + "epoch": 0.10103098231646945, + "grad_norm": 6.978979110717773, + "learning_rate": 4.831615029472551e-05, + "loss": 3.1614, + "step": 325000 + }, + { + "epoch": 0.10118641459695632, + "grad_norm": 7.31102180480957, + "learning_rate": 4.8313559756717394e-05, + "loss": 3.1861, + "step": 325500 + }, + { + "epoch": 0.1013418468774432, + "grad_norm": 7.583340644836426, + "learning_rate": 4.831096921870928e-05, + "loss": 3.2097, + "step": 326000 + }, + { + "epoch": 0.10149727915793008, + "grad_norm": 6.478161334991455, + "learning_rate": 4.830837868070117e-05, + "loss": 3.1903, + "step": 326500 + }, + { + "epoch": 0.10165271143841695, + "grad_norm": 5.136001110076904, + "learning_rate": 4.8305788142693056e-05, + "loss": 3.1548, + "step": 327000 + }, + { + "epoch": 0.10180814371890383, + "grad_norm": 28.545650482177734, + "learning_rate": 4.830319760468494e-05, + "loss": 3.1899, + "step": 327500 + }, + { + "epoch": 0.10196357599939071, + "grad_norm": 6.596538066864014, + "learning_rate": 4.830060706667682e-05, + "loss": 3.1644, + "step": 328000 + }, + { + "epoch": 0.10211900827987758, + "grad_norm": 9.070351600646973, + "learning_rate": 4.829801652866871e-05, + "loss": 3.1588, + "step": 328500 + }, + { + "epoch": 0.10227444056036446, + "grad_norm": 6.285277843475342, + "learning_rate": 4.829542599066059e-05, + "loss": 3.1519, + "step": 329000 + }, + { + "epoch": 0.10242987284085134, + "grad_norm": 7.853664875030518, + "learning_rate": 4.829283545265248e-05, + "loss": 3.1298, + "step": 329500 + }, + { + "epoch": 0.1025853051213382, + "grad_norm": 7.560462951660156, + "learning_rate": 4.8290244914644365e-05, + "loss": 3.1649, + "step": 330000 + }, + { + "epoch": 0.10274073740182509, + "grad_norm": 10.801639556884766, + "learning_rate": 4.828765437663625e-05, + "loss": 3.1882, + "step": 330500 + }, + { + "epoch": 0.10289616968231197, + "grad_norm": 7.7077484130859375, + "learning_rate": 4.828506383862814e-05, + "loss": 3.1596, + "step": 331000 + }, + { + "epoch": 0.10305160196279883, + "grad_norm": 7.235602378845215, + "learning_rate": 4.8282473300620026e-05, + "loss": 3.1196, + "step": 331500 + }, + { + "epoch": 0.10320703424328571, + "grad_norm": 6.199057102203369, + "learning_rate": 4.827988276261191e-05, + "loss": 3.2239, + "step": 332000 + }, + { + "epoch": 0.1033624665237726, + "grad_norm": 7.597100734710693, + "learning_rate": 4.8277292224603794e-05, + "loss": 3.1901, + "step": 332500 + }, + { + "epoch": 0.10351789880425946, + "grad_norm": 6.741291522979736, + "learning_rate": 4.827470168659568e-05, + "loss": 3.1846, + "step": 333000 + }, + { + "epoch": 0.10367333108474634, + "grad_norm": 12.66021728515625, + "learning_rate": 4.827211114858756e-05, + "loss": 3.1582, + "step": 333500 + }, + { + "epoch": 0.10382876336523322, + "grad_norm": 8.409788131713867, + "learning_rate": 4.826952061057945e-05, + "loss": 3.1546, + "step": 334000 + }, + { + "epoch": 0.10398419564572009, + "grad_norm": 7.118747711181641, + "learning_rate": 4.826693007257133e-05, + "loss": 3.2243, + "step": 334500 + }, + { + "epoch": 0.10413962792620697, + "grad_norm": 8.718803405761719, + "learning_rate": 4.8264339534563216e-05, + "loss": 3.2037, + "step": 335000 + }, + { + "epoch": 0.10429506020669385, + "grad_norm": 13.49921703338623, + "learning_rate": 4.82617489965551e-05, + "loss": 3.1183, + "step": 335500 + }, + { + "epoch": 0.10445049248718072, + "grad_norm": 6.537725448608398, + "learning_rate": 4.825915845854699e-05, + "loss": 3.1419, + "step": 336000 + }, + { + "epoch": 0.1046059247676676, + "grad_norm": 7.4512224197387695, + "learning_rate": 4.825656792053888e-05, + "loss": 3.1537, + "step": 336500 + }, + { + "epoch": 0.10476135704815448, + "grad_norm": 8.356022834777832, + "learning_rate": 4.8253977382530765e-05, + "loss": 3.1617, + "step": 337000 + }, + { + "epoch": 0.10491678932864135, + "grad_norm": 7.620083332061768, + "learning_rate": 4.8251386844522645e-05, + "loss": 3.1689, + "step": 337500 + }, + { + "epoch": 0.10507222160912823, + "grad_norm": 6.256901264190674, + "learning_rate": 4.824879630651453e-05, + "loss": 3.1651, + "step": 338000 + }, + { + "epoch": 0.10522765388961511, + "grad_norm": 8.462471961975098, + "learning_rate": 4.824620576850642e-05, + "loss": 3.1492, + "step": 338500 + }, + { + "epoch": 0.10538308617010197, + "grad_norm": 6.423676013946533, + "learning_rate": 4.82436152304983e-05, + "loss": 3.14, + "step": 339000 + }, + { + "epoch": 0.10553851845058886, + "grad_norm": 19.469085693359375, + "learning_rate": 4.824102469249019e-05, + "loss": 3.1812, + "step": 339500 + }, + { + "epoch": 0.10569395073107574, + "grad_norm": 7.0771403312683105, + "learning_rate": 4.8238434154482074e-05, + "loss": 3.1783, + "step": 340000 + }, + { + "epoch": 0.1058493830115626, + "grad_norm": 8.084543228149414, + "learning_rate": 4.823584361647396e-05, + "loss": 3.1899, + "step": 340500 + }, + { + "epoch": 0.10600481529204948, + "grad_norm": 10.814322471618652, + "learning_rate": 4.823325307846585e-05, + "loss": 3.2441, + "step": 341000 + }, + { + "epoch": 0.10616024757253636, + "grad_norm": 7.8129682540893555, + "learning_rate": 4.823066254045773e-05, + "loss": 3.1622, + "step": 341500 + }, + { + "epoch": 0.10631567985302323, + "grad_norm": 8.096418380737305, + "learning_rate": 4.8228072002449616e-05, + "loss": 3.1883, + "step": 342000 + }, + { + "epoch": 0.10647111213351011, + "grad_norm": 9.536809921264648, + "learning_rate": 4.82254814644415e-05, + "loss": 3.2028, + "step": 342500 + }, + { + "epoch": 0.106626544413997, + "grad_norm": 12.353974342346191, + "learning_rate": 4.8222890926433383e-05, + "loss": 3.1733, + "step": 343000 + }, + { + "epoch": 0.10678197669448386, + "grad_norm": 7.359539985656738, + "learning_rate": 4.822030038842527e-05, + "loss": 3.1964, + "step": 343500 + }, + { + "epoch": 0.10693740897497074, + "grad_norm": 7.7835693359375, + "learning_rate": 4.821770985041716e-05, + "loss": 3.1776, + "step": 344000 + }, + { + "epoch": 0.10709284125545762, + "grad_norm": 6.593421459197998, + "learning_rate": 4.821511931240904e-05, + "loss": 3.1543, + "step": 344500 + }, + { + "epoch": 0.10724827353594449, + "grad_norm": 7.068169593811035, + "learning_rate": 4.8212528774400925e-05, + "loss": 3.2159, + "step": 345000 + }, + { + "epoch": 0.10740370581643137, + "grad_norm": 6.361688137054443, + "learning_rate": 4.820993823639281e-05, + "loss": 3.1941, + "step": 345500 + }, + { + "epoch": 0.10755913809691825, + "grad_norm": 9.361106872558594, + "learning_rate": 4.82073476983847e-05, + "loss": 3.2284, + "step": 346000 + }, + { + "epoch": 0.10771457037740512, + "grad_norm": 7.417961597442627, + "learning_rate": 4.820475716037659e-05, + "loss": 3.1589, + "step": 346500 + }, + { + "epoch": 0.107870002657892, + "grad_norm": 7.255673885345459, + "learning_rate": 4.820216662236847e-05, + "loss": 3.1996, + "step": 347000 + }, + { + "epoch": 0.10802543493837888, + "grad_norm": 8.575515747070312, + "learning_rate": 4.8199576084360354e-05, + "loss": 3.1834, + "step": 347500 + }, + { + "epoch": 0.10818086721886574, + "grad_norm": 7.615197658538818, + "learning_rate": 4.819698554635224e-05, + "loss": 3.1986, + "step": 348000 + }, + { + "epoch": 0.10833629949935263, + "grad_norm": 5.621335983276367, + "learning_rate": 4.819439500834412e-05, + "loss": 3.142, + "step": 348500 + }, + { + "epoch": 0.1084917317798395, + "grad_norm": 7.26146936416626, + "learning_rate": 4.819180447033601e-05, + "loss": 3.1755, + "step": 349000 + }, + { + "epoch": 0.10864716406032637, + "grad_norm": 7.788675785064697, + "learning_rate": 4.8189213932327896e-05, + "loss": 3.193, + "step": 349500 + }, + { + "epoch": 0.10880259634081325, + "grad_norm": 8.51718521118164, + "learning_rate": 4.8186623394319776e-05, + "loss": 3.2009, + "step": 350000 + }, + { + "epoch": 0.10895802862130013, + "grad_norm": 6.52815055847168, + "learning_rate": 4.818403285631167e-05, + "loss": 3.1851, + "step": 350500 + }, + { + "epoch": 0.109113460901787, + "grad_norm": 5.680782318115234, + "learning_rate": 4.818144231830356e-05, + "loss": 3.1997, + "step": 351000 + }, + { + "epoch": 0.10926889318227388, + "grad_norm": 22.686752319335938, + "learning_rate": 4.817885178029544e-05, + "loss": 3.1629, + "step": 351500 + }, + { + "epoch": 0.10942432546276076, + "grad_norm": 8.74708366394043, + "learning_rate": 4.8176261242287325e-05, + "loss": 3.159, + "step": 352000 + }, + { + "epoch": 0.10957975774324763, + "grad_norm": 8.501091957092285, + "learning_rate": 4.8173670704279205e-05, + "loss": 3.1879, + "step": 352500 + }, + { + "epoch": 0.10973519002373451, + "grad_norm": 8.665863990783691, + "learning_rate": 4.817108016627109e-05, + "loss": 3.1685, + "step": 353000 + }, + { + "epoch": 0.10989062230422139, + "grad_norm": 5.781067371368408, + "learning_rate": 4.816848962826298e-05, + "loss": 3.2013, + "step": 353500 + }, + { + "epoch": 0.11004605458470826, + "grad_norm": 7.972797870635986, + "learning_rate": 4.816589909025486e-05, + "loss": 3.1659, + "step": 354000 + }, + { + "epoch": 0.11020148686519514, + "grad_norm": 8.035216331481934, + "learning_rate": 4.816330855224675e-05, + "loss": 3.1844, + "step": 354500 + }, + { + "epoch": 0.11035691914568202, + "grad_norm": 8.77336597442627, + "learning_rate": 4.8160718014238634e-05, + "loss": 3.16, + "step": 355000 + }, + { + "epoch": 0.11051235142616889, + "grad_norm": 9.973501205444336, + "learning_rate": 4.815812747623052e-05, + "loss": 3.151, + "step": 355500 + }, + { + "epoch": 0.11066778370665577, + "grad_norm": 21.21275520324707, + "learning_rate": 4.815553693822241e-05, + "loss": 3.1429, + "step": 356000 + }, + { + "epoch": 0.11082321598714265, + "grad_norm": 10.313750267028809, + "learning_rate": 4.8152946400214296e-05, + "loss": 3.1489, + "step": 356500 + }, + { + "epoch": 0.11097864826762951, + "grad_norm": 8.096628189086914, + "learning_rate": 4.8150355862206176e-05, + "loss": 3.1739, + "step": 357000 + }, + { + "epoch": 0.1111340805481164, + "grad_norm": 7.549122333526611, + "learning_rate": 4.8147765324198063e-05, + "loss": 3.1827, + "step": 357500 + }, + { + "epoch": 0.11128951282860328, + "grad_norm": 5.541996002197266, + "learning_rate": 4.814517478618995e-05, + "loss": 3.1592, + "step": 358000 + }, + { + "epoch": 0.11144494510909014, + "grad_norm": 6.948065757751465, + "learning_rate": 4.814258424818183e-05, + "loss": 3.2019, + "step": 358500 + }, + { + "epoch": 0.11160037738957702, + "grad_norm": 9.56619644165039, + "learning_rate": 4.813999371017372e-05, + "loss": 3.1386, + "step": 359000 + }, + { + "epoch": 0.1117558096700639, + "grad_norm": 13.557573318481445, + "learning_rate": 4.81374031721656e-05, + "loss": 3.1552, + "step": 359500 + }, + { + "epoch": 0.11191124195055077, + "grad_norm": 7.573786735534668, + "learning_rate": 4.8134812634157486e-05, + "loss": 3.1292, + "step": 360000 + }, + { + "epoch": 0.11206667423103765, + "grad_norm": 9.161781311035156, + "learning_rate": 4.813222209614938e-05, + "loss": 3.1889, + "step": 360500 + }, + { + "epoch": 0.11222210651152453, + "grad_norm": 6.09195613861084, + "learning_rate": 4.812963155814126e-05, + "loss": 3.1486, + "step": 361000 + }, + { + "epoch": 0.1123775387920114, + "grad_norm": 6.249310493469238, + "learning_rate": 4.812704102013315e-05, + "loss": 3.2133, + "step": 361500 + }, + { + "epoch": 0.11253297107249828, + "grad_norm": 7.817121982574463, + "learning_rate": 4.8124450482125034e-05, + "loss": 3.1592, + "step": 362000 + }, + { + "epoch": 0.11268840335298516, + "grad_norm": 6.628620624542236, + "learning_rate": 4.8121859944116915e-05, + "loss": 3.1472, + "step": 362500 + }, + { + "epoch": 0.11284383563347203, + "grad_norm": 7.423906326293945, + "learning_rate": 4.81192694061088e-05, + "loss": 3.1172, + "step": 363000 + }, + { + "epoch": 0.11299926791395891, + "grad_norm": 9.232403755187988, + "learning_rate": 4.811667886810069e-05, + "loss": 3.1335, + "step": 363500 + }, + { + "epoch": 0.11315470019444579, + "grad_norm": 8.58322811126709, + "learning_rate": 4.811408833009257e-05, + "loss": 3.1621, + "step": 364000 + }, + { + "epoch": 0.11331013247493266, + "grad_norm": 9.791912078857422, + "learning_rate": 4.8111497792084456e-05, + "loss": 3.1529, + "step": 364500 + }, + { + "epoch": 0.11346556475541954, + "grad_norm": 7.634219646453857, + "learning_rate": 4.8108907254076344e-05, + "loss": 3.1369, + "step": 365000 + }, + { + "epoch": 0.1136209970359064, + "grad_norm": 7.349883079528809, + "learning_rate": 4.810631671606823e-05, + "loss": 3.1838, + "step": 365500 + }, + { + "epoch": 0.11377642931639328, + "grad_norm": 12.246726989746094, + "learning_rate": 4.810372617806012e-05, + "loss": 3.1692, + "step": 366000 + }, + { + "epoch": 0.11393186159688017, + "grad_norm": 8.762874603271484, + "learning_rate": 4.8101135640052e-05, + "loss": 3.182, + "step": 366500 + }, + { + "epoch": 0.11408729387736703, + "grad_norm": 46.773624420166016, + "learning_rate": 4.8098545102043885e-05, + "loss": 3.1698, + "step": 367000 + }, + { + "epoch": 0.11424272615785391, + "grad_norm": 7.200114727020264, + "learning_rate": 4.809595456403577e-05, + "loss": 3.1913, + "step": 367500 + }, + { + "epoch": 0.1143981584383408, + "grad_norm": 18.41118621826172, + "learning_rate": 4.809336402602765e-05, + "loss": 3.1855, + "step": 368000 + }, + { + "epoch": 0.11455359071882766, + "grad_norm": 6.58657693862915, + "learning_rate": 4.809077348801954e-05, + "loss": 3.1735, + "step": 368500 + }, + { + "epoch": 0.11470902299931454, + "grad_norm": 8.318811416625977, + "learning_rate": 4.808818295001143e-05, + "loss": 3.1942, + "step": 369000 + }, + { + "epoch": 0.11486445527980142, + "grad_norm": 7.852906227111816, + "learning_rate": 4.808559241200331e-05, + "loss": 3.2129, + "step": 369500 + }, + { + "epoch": 0.11501988756028829, + "grad_norm": 21.217599868774414, + "learning_rate": 4.8083001873995195e-05, + "loss": 3.1362, + "step": 370000 + }, + { + "epoch": 0.11517531984077517, + "grad_norm": 6.146364688873291, + "learning_rate": 4.808041133598708e-05, + "loss": 3.1563, + "step": 370500 + }, + { + "epoch": 0.11533075212126205, + "grad_norm": 7.591767311096191, + "learning_rate": 4.807782079797897e-05, + "loss": 3.225, + "step": 371000 + }, + { + "epoch": 0.11548618440174892, + "grad_norm": 9.305745124816895, + "learning_rate": 4.8075230259970856e-05, + "loss": 3.1293, + "step": 371500 + }, + { + "epoch": 0.1156416166822358, + "grad_norm": 12.88448715209961, + "learning_rate": 4.807263972196274e-05, + "loss": 3.1471, + "step": 372000 + }, + { + "epoch": 0.11579704896272268, + "grad_norm": 7.799169063568115, + "learning_rate": 4.8070049183954624e-05, + "loss": 3.1488, + "step": 372500 + }, + { + "epoch": 0.11595248124320955, + "grad_norm": 8.73543930053711, + "learning_rate": 4.806745864594651e-05, + "loss": 3.1467, + "step": 373000 + }, + { + "epoch": 0.11610791352369643, + "grad_norm": 7.622913837432861, + "learning_rate": 4.806486810793839e-05, + "loss": 3.1905, + "step": 373500 + }, + { + "epoch": 0.11626334580418331, + "grad_norm": 7.705414295196533, + "learning_rate": 4.806227756993028e-05, + "loss": 3.1681, + "step": 374000 + }, + { + "epoch": 0.11641877808467017, + "grad_norm": 9.774781227111816, + "learning_rate": 4.8059687031922166e-05, + "loss": 3.1661, + "step": 374500 + }, + { + "epoch": 0.11657421036515705, + "grad_norm": 9.945212364196777, + "learning_rate": 4.805709649391405e-05, + "loss": 3.1418, + "step": 375000 + }, + { + "epoch": 0.11672964264564394, + "grad_norm": 8.473309516906738, + "learning_rate": 4.805450595590594e-05, + "loss": 3.1217, + "step": 375500 + }, + { + "epoch": 0.1168850749261308, + "grad_norm": 7.045749187469482, + "learning_rate": 4.805191541789783e-05, + "loss": 3.2105, + "step": 376000 + }, + { + "epoch": 0.11704050720661768, + "grad_norm": 6.55014705657959, + "learning_rate": 4.804932487988971e-05, + "loss": 3.1595, + "step": 376500 + }, + { + "epoch": 0.11719593948710456, + "grad_norm": 8.131903648376465, + "learning_rate": 4.8046734341881595e-05, + "loss": 3.1669, + "step": 377000 + }, + { + "epoch": 0.11735137176759143, + "grad_norm": 9.034210205078125, + "learning_rate": 4.8044143803873475e-05, + "loss": 3.1973, + "step": 377500 + }, + { + "epoch": 0.11750680404807831, + "grad_norm": 6.000582695007324, + "learning_rate": 4.804155326586536e-05, + "loss": 3.1761, + "step": 378000 + }, + { + "epoch": 0.11766223632856519, + "grad_norm": 6.36012601852417, + "learning_rate": 4.803896272785725e-05, + "loss": 3.1956, + "step": 378500 + }, + { + "epoch": 0.11781766860905206, + "grad_norm": 7.2952799797058105, + "learning_rate": 4.803637218984913e-05, + "loss": 3.1876, + "step": 379000 + }, + { + "epoch": 0.11797310088953894, + "grad_norm": 7.499868392944336, + "learning_rate": 4.803378165184102e-05, + "loss": 3.1162, + "step": 379500 + }, + { + "epoch": 0.11812853317002582, + "grad_norm": 15.005781173706055, + "learning_rate": 4.8031191113832904e-05, + "loss": 3.1548, + "step": 380000 + }, + { + "epoch": 0.11828396545051269, + "grad_norm": 6.821253776550293, + "learning_rate": 4.802860057582479e-05, + "loss": 3.1711, + "step": 380500 + }, + { + "epoch": 0.11843939773099957, + "grad_norm": 6.320420742034912, + "learning_rate": 4.802601003781668e-05, + "loss": 3.1941, + "step": 381000 + }, + { + "epoch": 0.11859483001148645, + "grad_norm": 7.9406023025512695, + "learning_rate": 4.8023419499808565e-05, + "loss": 3.188, + "step": 381500 + }, + { + "epoch": 0.11875026229197332, + "grad_norm": 9.333666801452637, + "learning_rate": 4.8020828961800446e-05, + "loss": 3.1638, + "step": 382000 + }, + { + "epoch": 0.1189056945724602, + "grad_norm": 8.758200645446777, + "learning_rate": 4.801823842379233e-05, + "loss": 3.1952, + "step": 382500 + }, + { + "epoch": 0.11906112685294708, + "grad_norm": 6.276987552642822, + "learning_rate": 4.8015647885784213e-05, + "loss": 3.2182, + "step": 383000 + }, + { + "epoch": 0.11921655913343394, + "grad_norm": 8.003066062927246, + "learning_rate": 4.80130573477761e-05, + "loss": 3.182, + "step": 383500 + }, + { + "epoch": 0.11937199141392082, + "grad_norm": 7.19912576675415, + "learning_rate": 4.801046680976799e-05, + "loss": 3.1905, + "step": 384000 + }, + { + "epoch": 0.1195274236944077, + "grad_norm": 11.26898193359375, + "learning_rate": 4.8007876271759875e-05, + "loss": 3.1496, + "step": 384500 + }, + { + "epoch": 0.11968285597489457, + "grad_norm": 23.264074325561523, + "learning_rate": 4.800528573375176e-05, + "loss": 3.1304, + "step": 385000 + }, + { + "epoch": 0.11983828825538145, + "grad_norm": 8.069762229919434, + "learning_rate": 4.800269519574365e-05, + "loss": 3.1539, + "step": 385500 + }, + { + "epoch": 0.11999372053586833, + "grad_norm": 8.633646011352539, + "learning_rate": 4.800010465773553e-05, + "loss": 3.2005, + "step": 386000 + }, + { + "epoch": 0.1201491528163552, + "grad_norm": 6.946836948394775, + "learning_rate": 4.799751411972742e-05, + "loss": 3.1523, + "step": 386500 + }, + { + "epoch": 0.12030458509684208, + "grad_norm": 7.571057319641113, + "learning_rate": 4.7994923581719304e-05, + "loss": 3.183, + "step": 387000 + }, + { + "epoch": 0.12046001737732896, + "grad_norm": 7.3459062576293945, + "learning_rate": 4.7992333043711184e-05, + "loss": 3.1307, + "step": 387500 + }, + { + "epoch": 0.12061544965781583, + "grad_norm": 8.511014938354492, + "learning_rate": 4.798974250570307e-05, + "loss": 3.1431, + "step": 388000 + }, + { + "epoch": 0.12077088193830271, + "grad_norm": 6.055860996246338, + "learning_rate": 4.798715196769495e-05, + "loss": 3.1602, + "step": 388500 + }, + { + "epoch": 0.12092631421878959, + "grad_norm": 9.180620193481445, + "learning_rate": 4.798456142968684e-05, + "loss": 3.1504, + "step": 389000 + }, + { + "epoch": 0.12108174649927646, + "grad_norm": 7.223909378051758, + "learning_rate": 4.7981970891678726e-05, + "loss": 3.1773, + "step": 389500 + }, + { + "epoch": 0.12123717877976334, + "grad_norm": 10.11792278289795, + "learning_rate": 4.797938035367061e-05, + "loss": 3.1888, + "step": 390000 + }, + { + "epoch": 0.12139261106025022, + "grad_norm": 9.563331604003906, + "learning_rate": 4.79767898156625e-05, + "loss": 3.1977, + "step": 390500 + }, + { + "epoch": 0.12154804334073709, + "grad_norm": 6.217080593109131, + "learning_rate": 4.797419927765439e-05, + "loss": 3.151, + "step": 391000 + }, + { + "epoch": 0.12170347562122397, + "grad_norm": 7.912075519561768, + "learning_rate": 4.797160873964627e-05, + "loss": 3.1438, + "step": 391500 + }, + { + "epoch": 0.12185890790171085, + "grad_norm": 7.800711631774902, + "learning_rate": 4.7969018201638155e-05, + "loss": 3.195, + "step": 392000 + }, + { + "epoch": 0.12201434018219771, + "grad_norm": 18.523540496826172, + "learning_rate": 4.796642766363004e-05, + "loss": 3.1582, + "step": 392500 + }, + { + "epoch": 0.1221697724626846, + "grad_norm": 7.865385055541992, + "learning_rate": 4.796383712562192e-05, + "loss": 3.1554, + "step": 393000 + }, + { + "epoch": 0.12232520474317148, + "grad_norm": 11.4461088180542, + "learning_rate": 4.796124658761381e-05, + "loss": 3.1299, + "step": 393500 + }, + { + "epoch": 0.12248063702365834, + "grad_norm": 8.190375328063965, + "learning_rate": 4.79586560496057e-05, + "loss": 3.123, + "step": 394000 + }, + { + "epoch": 0.12263606930414522, + "grad_norm": 5.313860893249512, + "learning_rate": 4.7956065511597584e-05, + "loss": 3.1568, + "step": 394500 + }, + { + "epoch": 0.1227915015846321, + "grad_norm": 9.012430191040039, + "learning_rate": 4.795347497358947e-05, + "loss": 3.1759, + "step": 395000 + }, + { + "epoch": 0.12294693386511897, + "grad_norm": 5.486538887023926, + "learning_rate": 4.795088443558135e-05, + "loss": 3.0969, + "step": 395500 + }, + { + "epoch": 0.12310236614560585, + "grad_norm": 8.740150451660156, + "learning_rate": 4.794829389757324e-05, + "loss": 3.1474, + "step": 396000 + }, + { + "epoch": 0.12325779842609273, + "grad_norm": 9.349884033203125, + "learning_rate": 4.7945703359565126e-05, + "loss": 3.1599, + "step": 396500 + }, + { + "epoch": 0.1234132307065796, + "grad_norm": 8.769721031188965, + "learning_rate": 4.7943112821557006e-05, + "loss": 3.1855, + "step": 397000 + }, + { + "epoch": 0.12356866298706648, + "grad_norm": 7.8129096031188965, + "learning_rate": 4.7940522283548893e-05, + "loss": 3.2108, + "step": 397500 + }, + { + "epoch": 0.12372409526755336, + "grad_norm": 10.217227935791016, + "learning_rate": 4.793793174554078e-05, + "loss": 3.152, + "step": 398000 + }, + { + "epoch": 0.12387952754804023, + "grad_norm": 9.240826606750488, + "learning_rate": 4.793534120753266e-05, + "loss": 3.1569, + "step": 398500 + }, + { + "epoch": 0.12403495982852711, + "grad_norm": 6.651777267456055, + "learning_rate": 4.793275066952455e-05, + "loss": 3.142, + "step": 399000 + }, + { + "epoch": 0.12419039210901399, + "grad_norm": 6.177861213684082, + "learning_rate": 4.7930160131516435e-05, + "loss": 3.1371, + "step": 399500 + }, + { + "epoch": 0.12434582438950086, + "grad_norm": 9.802507400512695, + "learning_rate": 4.792756959350832e-05, + "loss": 3.1637, + "step": 400000 + }, + { + "epoch": 0.12450125666998774, + "grad_norm": 7.092039585113525, + "learning_rate": 4.792497905550021e-05, + "loss": 3.1472, + "step": 400500 + }, + { + "epoch": 0.12465668895047462, + "grad_norm": 6.823572635650635, + "learning_rate": 4.792238851749209e-05, + "loss": 3.1623, + "step": 401000 + }, + { + "epoch": 0.12481212123096148, + "grad_norm": 18.725521087646484, + "learning_rate": 4.791979797948398e-05, + "loss": 3.2119, + "step": 401500 + }, + { + "epoch": 0.12496755351144836, + "grad_norm": 8.051300048828125, + "learning_rate": 4.7917207441475864e-05, + "loss": 3.1563, + "step": 402000 + }, + { + "epoch": 0.12512298579193523, + "grad_norm": 12.140485763549805, + "learning_rate": 4.7914616903467745e-05, + "loss": 3.1448, + "step": 402500 + }, + { + "epoch": 0.12527841807242213, + "grad_norm": 6.481038570404053, + "learning_rate": 4.791202636545963e-05, + "loss": 3.1545, + "step": 403000 + }, + { + "epoch": 0.125433850352909, + "grad_norm": 11.00004768371582, + "learning_rate": 4.790943582745152e-05, + "loss": 3.1447, + "step": 403500 + }, + { + "epoch": 0.12558928263339586, + "grad_norm": 8.333738327026367, + "learning_rate": 4.7906845289443406e-05, + "loss": 3.1352, + "step": 404000 + }, + { + "epoch": 0.12574471491388275, + "grad_norm": 7.298405170440674, + "learning_rate": 4.790425475143529e-05, + "loss": 3.151, + "step": 404500 + }, + { + "epoch": 0.12590014719436962, + "grad_norm": 6.948157787322998, + "learning_rate": 4.790166421342718e-05, + "loss": 3.2138, + "step": 405000 + }, + { + "epoch": 0.1260555794748565, + "grad_norm": 8.843202590942383, + "learning_rate": 4.789907367541906e-05, + "loss": 3.1458, + "step": 405500 + }, + { + "epoch": 0.12621101175534338, + "grad_norm": 8.532846450805664, + "learning_rate": 4.789648313741095e-05, + "loss": 3.1521, + "step": 406000 + }, + { + "epoch": 0.12636644403583025, + "grad_norm": 5.986678123474121, + "learning_rate": 4.7893892599402835e-05, + "loss": 3.1744, + "step": 406500 + }, + { + "epoch": 0.12652187631631712, + "grad_norm": 11.4813814163208, + "learning_rate": 4.7891302061394715e-05, + "loss": 3.1573, + "step": 407000 + }, + { + "epoch": 0.126677308596804, + "grad_norm": 9.13705825805664, + "learning_rate": 4.78887115233866e-05, + "loss": 3.1337, + "step": 407500 + }, + { + "epoch": 0.12683274087729088, + "grad_norm": 8.003881454467773, + "learning_rate": 4.788612098537848e-05, + "loss": 3.1384, + "step": 408000 + }, + { + "epoch": 0.12698817315777775, + "grad_norm": 10.711318969726562, + "learning_rate": 4.788353044737037e-05, + "loss": 3.1609, + "step": 408500 + }, + { + "epoch": 0.12714360543826464, + "grad_norm": 6.5018744468688965, + "learning_rate": 4.788093990936226e-05, + "loss": 3.1665, + "step": 409000 + }, + { + "epoch": 0.1272990377187515, + "grad_norm": 7.376734733581543, + "learning_rate": 4.7878349371354144e-05, + "loss": 3.107, + "step": 409500 + }, + { + "epoch": 0.12745446999923837, + "grad_norm": 50.13873291015625, + "learning_rate": 4.787575883334603e-05, + "loss": 3.1939, + "step": 410000 + }, + { + "epoch": 0.12760990227972527, + "grad_norm": 6.739749908447266, + "learning_rate": 4.787316829533792e-05, + "loss": 3.1247, + "step": 410500 + }, + { + "epoch": 0.12776533456021213, + "grad_norm": 32.82851791381836, + "learning_rate": 4.78705777573298e-05, + "loss": 3.0931, + "step": 411000 + }, + { + "epoch": 0.127920766840699, + "grad_norm": 7.24578332901001, + "learning_rate": 4.7867987219321686e-05, + "loss": 3.1496, + "step": 411500 + }, + { + "epoch": 0.1280761991211859, + "grad_norm": 7.607905864715576, + "learning_rate": 4.7865396681313573e-05, + "loss": 3.1536, + "step": 412000 + }, + { + "epoch": 0.12823163140167276, + "grad_norm": 7.080807209014893, + "learning_rate": 4.7862806143305454e-05, + "loss": 3.1401, + "step": 412500 + }, + { + "epoch": 0.12838706368215963, + "grad_norm": 7.293042182922363, + "learning_rate": 4.786021560529734e-05, + "loss": 3.1693, + "step": 413000 + }, + { + "epoch": 0.12854249596264652, + "grad_norm": 6.508268356323242, + "learning_rate": 4.785762506728922e-05, + "loss": 3.0903, + "step": 413500 + }, + { + "epoch": 0.1286979282431334, + "grad_norm": 7.081932067871094, + "learning_rate": 4.7855034529281115e-05, + "loss": 3.1422, + "step": 414000 + }, + { + "epoch": 0.12885336052362026, + "grad_norm": 8.0665864944458, + "learning_rate": 4.7852443991273e-05, + "loss": 3.1341, + "step": 414500 + }, + { + "epoch": 0.12900879280410715, + "grad_norm": 7.572020053863525, + "learning_rate": 4.784985345326488e-05, + "loss": 3.1657, + "step": 415000 + }, + { + "epoch": 0.12916422508459402, + "grad_norm": 11.966236114501953, + "learning_rate": 4.784726291525677e-05, + "loss": 3.1577, + "step": 415500 + }, + { + "epoch": 0.1293196573650809, + "grad_norm": 8.962944984436035, + "learning_rate": 4.784467237724866e-05, + "loss": 3.1406, + "step": 416000 + }, + { + "epoch": 0.12947508964556778, + "grad_norm": 9.291458129882812, + "learning_rate": 4.784208183924054e-05, + "loss": 3.1191, + "step": 416500 + }, + { + "epoch": 0.12963052192605465, + "grad_norm": 8.803328514099121, + "learning_rate": 4.7839491301232425e-05, + "loss": 3.1674, + "step": 417000 + }, + { + "epoch": 0.12978595420654152, + "grad_norm": 7.731800079345703, + "learning_rate": 4.783690076322431e-05, + "loss": 3.1425, + "step": 417500 + }, + { + "epoch": 0.1299413864870284, + "grad_norm": 19.87569808959961, + "learning_rate": 4.783431022521619e-05, + "loss": 3.1546, + "step": 418000 + }, + { + "epoch": 0.13009681876751528, + "grad_norm": 7.073700904846191, + "learning_rate": 4.783171968720808e-05, + "loss": 3.168, + "step": 418500 + }, + { + "epoch": 0.13025225104800214, + "grad_norm": 8.11892318725586, + "learning_rate": 4.7829129149199966e-05, + "loss": 3.165, + "step": 419000 + }, + { + "epoch": 0.13040768332848904, + "grad_norm": 8.460139274597168, + "learning_rate": 4.7826538611191854e-05, + "loss": 3.1198, + "step": 419500 + }, + { + "epoch": 0.1305631156089759, + "grad_norm": 8.19758415222168, + "learning_rate": 4.782394807318374e-05, + "loss": 3.1702, + "step": 420000 + }, + { + "epoch": 0.13071854788946277, + "grad_norm": 11.16972541809082, + "learning_rate": 4.782135753517562e-05, + "loss": 3.1492, + "step": 420500 + }, + { + "epoch": 0.13087398016994967, + "grad_norm": 8.225409507751465, + "learning_rate": 4.781876699716751e-05, + "loss": 3.1497, + "step": 421000 + }, + { + "epoch": 0.13102941245043653, + "grad_norm": 11.168668746948242, + "learning_rate": 4.7816176459159395e-05, + "loss": 3.2032, + "step": 421500 + }, + { + "epoch": 0.1311848447309234, + "grad_norm": 6.280300617218018, + "learning_rate": 4.7813585921151276e-05, + "loss": 3.1698, + "step": 422000 + }, + { + "epoch": 0.1313402770114103, + "grad_norm": 8.3519287109375, + "learning_rate": 4.781099538314316e-05, + "loss": 3.1795, + "step": 422500 + }, + { + "epoch": 0.13149570929189716, + "grad_norm": 6.673651695251465, + "learning_rate": 4.780840484513505e-05, + "loss": 3.1497, + "step": 423000 + }, + { + "epoch": 0.13165114157238403, + "grad_norm": 7.760652542114258, + "learning_rate": 4.780581430712693e-05, + "loss": 3.179, + "step": 423500 + }, + { + "epoch": 0.13180657385287092, + "grad_norm": 7.3829450607299805, + "learning_rate": 4.7803223769118824e-05, + "loss": 3.1727, + "step": 424000 + }, + { + "epoch": 0.1319620061333578, + "grad_norm": 34.79305648803711, + "learning_rate": 4.780063323111071e-05, + "loss": 3.1956, + "step": 424500 + }, + { + "epoch": 0.13211743841384466, + "grad_norm": 7.604132652282715, + "learning_rate": 4.779804269310259e-05, + "loss": 3.1604, + "step": 425000 + }, + { + "epoch": 0.13227287069433155, + "grad_norm": 9.164163589477539, + "learning_rate": 4.779545215509448e-05, + "loss": 3.1348, + "step": 425500 + }, + { + "epoch": 0.13242830297481842, + "grad_norm": 8.354143142700195, + "learning_rate": 4.779286161708636e-05, + "loss": 3.1748, + "step": 426000 + }, + { + "epoch": 0.13258373525530529, + "grad_norm": 7.475255489349365, + "learning_rate": 4.779027107907825e-05, + "loss": 3.1764, + "step": 426500 + }, + { + "epoch": 0.13273916753579218, + "grad_norm": 10.992897987365723, + "learning_rate": 4.7787680541070134e-05, + "loss": 3.1169, + "step": 427000 + }, + { + "epoch": 0.13289459981627905, + "grad_norm": 7.188453197479248, + "learning_rate": 4.7785090003062014e-05, + "loss": 3.1377, + "step": 427500 + }, + { + "epoch": 0.1330500320967659, + "grad_norm": 6.702340126037598, + "learning_rate": 4.77824994650539e-05, + "loss": 3.2123, + "step": 428000 + }, + { + "epoch": 0.1332054643772528, + "grad_norm": 8.602375984191895, + "learning_rate": 4.777990892704579e-05, + "loss": 3.1426, + "step": 428500 + }, + { + "epoch": 0.13336089665773967, + "grad_norm": 8.242195129394531, + "learning_rate": 4.7777318389037676e-05, + "loss": 3.1912, + "step": 429000 + }, + { + "epoch": 0.13351632893822654, + "grad_norm": 8.670475006103516, + "learning_rate": 4.777472785102956e-05, + "loss": 3.1361, + "step": 429500 + }, + { + "epoch": 0.13367176121871344, + "grad_norm": 7.025065898895264, + "learning_rate": 4.777213731302145e-05, + "loss": 3.1061, + "step": 430000 + }, + { + "epoch": 0.1338271934992003, + "grad_norm": 7.825850486755371, + "learning_rate": 4.776954677501333e-05, + "loss": 3.1595, + "step": 430500 + }, + { + "epoch": 0.13398262577968717, + "grad_norm": 8.039069175720215, + "learning_rate": 4.776695623700522e-05, + "loss": 3.162, + "step": 431000 + }, + { + "epoch": 0.13413805806017406, + "grad_norm": 6.535322666168213, + "learning_rate": 4.77643656989971e-05, + "loss": 3.1724, + "step": 431500 + }, + { + "epoch": 0.13429349034066093, + "grad_norm": 8.639843940734863, + "learning_rate": 4.7761775160988985e-05, + "loss": 3.1191, + "step": 432000 + }, + { + "epoch": 0.1344489226211478, + "grad_norm": 6.753958225250244, + "learning_rate": 4.775918462298087e-05, + "loss": 3.1538, + "step": 432500 + }, + { + "epoch": 0.1346043549016347, + "grad_norm": 7.792910099029541, + "learning_rate": 4.775659408497275e-05, + "loss": 3.1538, + "step": 433000 + }, + { + "epoch": 0.13475978718212156, + "grad_norm": 7.700492858886719, + "learning_rate": 4.775400354696464e-05, + "loss": 3.1521, + "step": 433500 + }, + { + "epoch": 0.13491521946260843, + "grad_norm": 8.914928436279297, + "learning_rate": 4.7751413008956534e-05, + "loss": 3.1206, + "step": 434000 + }, + { + "epoch": 0.13507065174309532, + "grad_norm": 7.788517475128174, + "learning_rate": 4.7748822470948414e-05, + "loss": 3.1277, + "step": 434500 + }, + { + "epoch": 0.1352260840235822, + "grad_norm": 15.724071502685547, + "learning_rate": 4.77462319329403e-05, + "loss": 3.1834, + "step": 435000 + }, + { + "epoch": 0.13538151630406906, + "grad_norm": 7.667590618133545, + "learning_rate": 4.774364139493219e-05, + "loss": 3.1452, + "step": 435500 + }, + { + "epoch": 0.13553694858455595, + "grad_norm": 6.0579514503479, + "learning_rate": 4.774105085692407e-05, + "loss": 3.1111, + "step": 436000 + }, + { + "epoch": 0.13569238086504282, + "grad_norm": 9.851696014404297, + "learning_rate": 4.7738460318915956e-05, + "loss": 3.1412, + "step": 436500 + }, + { + "epoch": 0.13584781314552968, + "grad_norm": 9.10062026977539, + "learning_rate": 4.7735869780907836e-05, + "loss": 3.1414, + "step": 437000 + }, + { + "epoch": 0.13600324542601658, + "grad_norm": 6.241948127746582, + "learning_rate": 4.773327924289972e-05, + "loss": 3.1538, + "step": 437500 + }, + { + "epoch": 0.13615867770650344, + "grad_norm": 7.406065464019775, + "learning_rate": 4.773068870489161e-05, + "loss": 3.1743, + "step": 438000 + }, + { + "epoch": 0.1363141099869903, + "grad_norm": 7.836674690246582, + "learning_rate": 4.77280981668835e-05, + "loss": 3.1346, + "step": 438500 + }, + { + "epoch": 0.1364695422674772, + "grad_norm": 7.677957534790039, + "learning_rate": 4.7725507628875385e-05, + "loss": 3.1777, + "step": 439000 + }, + { + "epoch": 0.13662497454796407, + "grad_norm": 8.180293083190918, + "learning_rate": 4.772291709086727e-05, + "loss": 3.1301, + "step": 439500 + }, + { + "epoch": 0.13678040682845094, + "grad_norm": 17.60361099243164, + "learning_rate": 4.772032655285915e-05, + "loss": 3.1399, + "step": 440000 + }, + { + "epoch": 0.13693583910893783, + "grad_norm": 5.802761554718018, + "learning_rate": 4.771773601485104e-05, + "loss": 3.1357, + "step": 440500 + }, + { + "epoch": 0.1370912713894247, + "grad_norm": 9.97792911529541, + "learning_rate": 4.771514547684293e-05, + "loss": 3.1397, + "step": 441000 + }, + { + "epoch": 0.13724670366991157, + "grad_norm": 7.635434150695801, + "learning_rate": 4.771255493883481e-05, + "loss": 3.1503, + "step": 441500 + }, + { + "epoch": 0.13740213595039846, + "grad_norm": 7.416116714477539, + "learning_rate": 4.7709964400826694e-05, + "loss": 3.1051, + "step": 442000 + }, + { + "epoch": 0.13755756823088533, + "grad_norm": 6.1030354499816895, + "learning_rate": 4.770737386281858e-05, + "loss": 3.0916, + "step": 442500 + }, + { + "epoch": 0.1377130005113722, + "grad_norm": 9.621749877929688, + "learning_rate": 4.770478332481046e-05, + "loss": 3.1491, + "step": 443000 + }, + { + "epoch": 0.1378684327918591, + "grad_norm": 6.204534530639648, + "learning_rate": 4.770219278680235e-05, + "loss": 3.1603, + "step": 443500 + }, + { + "epoch": 0.13802386507234596, + "grad_norm": 7.788522720336914, + "learning_rate": 4.7699602248794236e-05, + "loss": 3.1471, + "step": 444000 + }, + { + "epoch": 0.13817929735283282, + "grad_norm": 6.3051371574401855, + "learning_rate": 4.769701171078612e-05, + "loss": 3.1361, + "step": 444500 + }, + { + "epoch": 0.13833472963331972, + "grad_norm": 5.264625549316406, + "learning_rate": 4.769442117277801e-05, + "loss": 3.1514, + "step": 445000 + }, + { + "epoch": 0.1384901619138066, + "grad_norm": 8.13842487335205, + "learning_rate": 4.769183063476989e-05, + "loss": 3.1468, + "step": 445500 + }, + { + "epoch": 0.13864559419429345, + "grad_norm": 7.473953723907471, + "learning_rate": 4.768924009676178e-05, + "loss": 3.1476, + "step": 446000 + }, + { + "epoch": 0.13880102647478035, + "grad_norm": 12.467001914978027, + "learning_rate": 4.7686649558753665e-05, + "loss": 3.13, + "step": 446500 + }, + { + "epoch": 0.13895645875526721, + "grad_norm": 8.2540283203125, + "learning_rate": 4.7684059020745545e-05, + "loss": 3.1091, + "step": 447000 + }, + { + "epoch": 0.13911189103575408, + "grad_norm": 9.483057975769043, + "learning_rate": 4.768146848273743e-05, + "loss": 3.0823, + "step": 447500 + }, + { + "epoch": 0.13926732331624098, + "grad_norm": 7.416504859924316, + "learning_rate": 4.767887794472932e-05, + "loss": 3.1301, + "step": 448000 + }, + { + "epoch": 0.13942275559672784, + "grad_norm": 9.564129829406738, + "learning_rate": 4.767628740672121e-05, + "loss": 3.2077, + "step": 448500 + }, + { + "epoch": 0.1395781878772147, + "grad_norm": 7.16086483001709, + "learning_rate": 4.7673696868713094e-05, + "loss": 3.1219, + "step": 449000 + }, + { + "epoch": 0.1397336201577016, + "grad_norm": 7.281047344207764, + "learning_rate": 4.7671106330704974e-05, + "loss": 3.0804, + "step": 449500 + }, + { + "epoch": 0.13988905243818847, + "grad_norm": 8.276442527770996, + "learning_rate": 4.766851579269686e-05, + "loss": 3.1412, + "step": 450000 + }, + { + "epoch": 0.14004448471867534, + "grad_norm": 7.974361896514893, + "learning_rate": 4.766592525468875e-05, + "loss": 3.161, + "step": 450500 + }, + { + "epoch": 0.14019991699916223, + "grad_norm": 12.179359436035156, + "learning_rate": 4.766333471668063e-05, + "loss": 3.1262, + "step": 451000 + }, + { + "epoch": 0.1403553492796491, + "grad_norm": 20.679433822631836, + "learning_rate": 4.7660744178672516e-05, + "loss": 3.1642, + "step": 451500 + }, + { + "epoch": 0.14051078156013597, + "grad_norm": 6.92616081237793, + "learning_rate": 4.7658153640664403e-05, + "loss": 3.1276, + "step": 452000 + }, + { + "epoch": 0.14066621384062286, + "grad_norm": 8.531267166137695, + "learning_rate": 4.7655563102656284e-05, + "loss": 3.1393, + "step": 452500 + }, + { + "epoch": 0.14082164612110973, + "grad_norm": 5.695104122161865, + "learning_rate": 4.765297256464817e-05, + "loss": 3.1592, + "step": 453000 + }, + { + "epoch": 0.1409770784015966, + "grad_norm": 7.331014156341553, + "learning_rate": 4.765038202664006e-05, + "loss": 3.1258, + "step": 453500 + }, + { + "epoch": 0.1411325106820835, + "grad_norm": 11.13646125793457, + "learning_rate": 4.7647791488631945e-05, + "loss": 3.1557, + "step": 454000 + }, + { + "epoch": 0.14128794296257036, + "grad_norm": 4.1918535232543945, + "learning_rate": 4.764520095062383e-05, + "loss": 3.1441, + "step": 454500 + }, + { + "epoch": 0.14144337524305722, + "grad_norm": 7.496818542480469, + "learning_rate": 4.764261041261571e-05, + "loss": 3.1889, + "step": 455000 + }, + { + "epoch": 0.14159880752354412, + "grad_norm": 8.042387008666992, + "learning_rate": 4.76400198746076e-05, + "loss": 3.1773, + "step": 455500 + }, + { + "epoch": 0.14175423980403098, + "grad_norm": 7.501043319702148, + "learning_rate": 4.763742933659949e-05, + "loss": 3.1038, + "step": 456000 + }, + { + "epoch": 0.14190967208451785, + "grad_norm": 8.154942512512207, + "learning_rate": 4.763483879859137e-05, + "loss": 3.1347, + "step": 456500 + }, + { + "epoch": 0.14206510436500475, + "grad_norm": 7.898660182952881, + "learning_rate": 4.7632248260583255e-05, + "loss": 3.1548, + "step": 457000 + }, + { + "epoch": 0.1422205366454916, + "grad_norm": 5.75996208190918, + "learning_rate": 4.762965772257514e-05, + "loss": 3.1394, + "step": 457500 + }, + { + "epoch": 0.14237596892597848, + "grad_norm": 12.837496757507324, + "learning_rate": 4.762706718456703e-05, + "loss": 3.1295, + "step": 458000 + }, + { + "epoch": 0.14253140120646537, + "grad_norm": 21.744964599609375, + "learning_rate": 4.7624476646558916e-05, + "loss": 3.1234, + "step": 458500 + }, + { + "epoch": 0.14268683348695224, + "grad_norm": 7.433263301849365, + "learning_rate": 4.76218861085508e-05, + "loss": 3.147, + "step": 459000 + }, + { + "epoch": 0.1428422657674391, + "grad_norm": 6.558787822723389, + "learning_rate": 4.7619295570542684e-05, + "loss": 3.1762, + "step": 459500 + }, + { + "epoch": 0.142997698047926, + "grad_norm": 7.333337306976318, + "learning_rate": 4.761670503253457e-05, + "loss": 3.183, + "step": 460000 + }, + { + "epoch": 0.14315313032841287, + "grad_norm": 7.508825302124023, + "learning_rate": 4.761411449452646e-05, + "loss": 3.1536, + "step": 460500 + }, + { + "epoch": 0.14330856260889974, + "grad_norm": 6.2373366355896, + "learning_rate": 4.761152395651834e-05, + "loss": 3.0914, + "step": 461000 + }, + { + "epoch": 0.14346399488938663, + "grad_norm": 14.875862121582031, + "learning_rate": 4.7608933418510225e-05, + "loss": 3.1478, + "step": 461500 + }, + { + "epoch": 0.1436194271698735, + "grad_norm": 7.819972038269043, + "learning_rate": 4.7606342880502106e-05, + "loss": 3.1358, + "step": 462000 + }, + { + "epoch": 0.14377485945036036, + "grad_norm": 8.051518440246582, + "learning_rate": 4.760375234249399e-05, + "loss": 3.1739, + "step": 462500 + }, + { + "epoch": 0.14393029173084726, + "grad_norm": 6.175804138183594, + "learning_rate": 4.760116180448588e-05, + "loss": 3.1254, + "step": 463000 + }, + { + "epoch": 0.14408572401133413, + "grad_norm": 7.319088935852051, + "learning_rate": 4.759857126647777e-05, + "loss": 3.1767, + "step": 463500 + }, + { + "epoch": 0.144241156291821, + "grad_norm": 8.837154388427734, + "learning_rate": 4.7595980728469654e-05, + "loss": 3.1557, + "step": 464000 + }, + { + "epoch": 0.1443965885723079, + "grad_norm": 18.79387855529785, + "learning_rate": 4.759339019046154e-05, + "loss": 3.2073, + "step": 464500 + }, + { + "epoch": 0.14455202085279475, + "grad_norm": 7.559935569763184, + "learning_rate": 4.759079965245342e-05, + "loss": 3.1161, + "step": 465000 + }, + { + "epoch": 0.14470745313328162, + "grad_norm": 8.655670166015625, + "learning_rate": 4.758820911444531e-05, + "loss": 3.1786, + "step": 465500 + }, + { + "epoch": 0.1448628854137685, + "grad_norm": 9.032782554626465, + "learning_rate": 4.7585618576437196e-05, + "loss": 3.1147, + "step": 466000 + }, + { + "epoch": 0.14501831769425538, + "grad_norm": 9.336148262023926, + "learning_rate": 4.758302803842908e-05, + "loss": 3.1458, + "step": 466500 + }, + { + "epoch": 0.14517374997474225, + "grad_norm": 8.539565086364746, + "learning_rate": 4.7580437500420964e-05, + "loss": 3.1301, + "step": 467000 + }, + { + "epoch": 0.14532918225522912, + "grad_norm": 7.320023536682129, + "learning_rate": 4.757784696241285e-05, + "loss": 3.164, + "step": 467500 + }, + { + "epoch": 0.145484614535716, + "grad_norm": 9.050804138183594, + "learning_rate": 4.757525642440474e-05, + "loss": 3.1469, + "step": 468000 + }, + { + "epoch": 0.14564004681620288, + "grad_norm": 8.79496955871582, + "learning_rate": 4.7572665886396625e-05, + "loss": 3.0933, + "step": 468500 + }, + { + "epoch": 0.14579547909668975, + "grad_norm": 8.964601516723633, + "learning_rate": 4.7570075348388506e-05, + "loss": 3.1292, + "step": 469000 + }, + { + "epoch": 0.14595091137717664, + "grad_norm": 6.9062700271606445, + "learning_rate": 4.756748481038039e-05, + "loss": 3.148, + "step": 469500 + }, + { + "epoch": 0.1461063436576635, + "grad_norm": 6.746646404266357, + "learning_rate": 4.756489427237228e-05, + "loss": 3.0845, + "step": 470000 + }, + { + "epoch": 0.14626177593815037, + "grad_norm": 15.076488494873047, + "learning_rate": 4.756230373436416e-05, + "loss": 3.1341, + "step": 470500 + }, + { + "epoch": 0.14641720821863727, + "grad_norm": 5.707334518432617, + "learning_rate": 4.755971319635605e-05, + "loss": 3.1574, + "step": 471000 + }, + { + "epoch": 0.14657264049912413, + "grad_norm": 6.047316551208496, + "learning_rate": 4.7557122658347935e-05, + "loss": 3.1318, + "step": 471500 + }, + { + "epoch": 0.146728072779611, + "grad_norm": 7.461501121520996, + "learning_rate": 4.7554532120339815e-05, + "loss": 3.1338, + "step": 472000 + }, + { + "epoch": 0.1468835050600979, + "grad_norm": 18.28261375427246, + "learning_rate": 4.75519415823317e-05, + "loss": 3.1421, + "step": 472500 + }, + { + "epoch": 0.14703893734058476, + "grad_norm": 7.843385219573975, + "learning_rate": 4.754935104432359e-05, + "loss": 3.1869, + "step": 473000 + }, + { + "epoch": 0.14719436962107163, + "grad_norm": 8.086275100708008, + "learning_rate": 4.7546760506315476e-05, + "loss": 3.0975, + "step": 473500 + }, + { + "epoch": 0.14734980190155852, + "grad_norm": 7.200100421905518, + "learning_rate": 4.7544169968307364e-05, + "loss": 3.1329, + "step": 474000 + }, + { + "epoch": 0.1475052341820454, + "grad_norm": 10.683578491210938, + "learning_rate": 4.7541579430299244e-05, + "loss": 3.1491, + "step": 474500 + }, + { + "epoch": 0.14766066646253226, + "grad_norm": 7.068333625793457, + "learning_rate": 4.753898889229113e-05, + "loss": 3.1405, + "step": 475000 + }, + { + "epoch": 0.14781609874301915, + "grad_norm": 7.976680278778076, + "learning_rate": 4.753639835428302e-05, + "loss": 3.1335, + "step": 475500 + }, + { + "epoch": 0.14797153102350602, + "grad_norm": 6.7269978523254395, + "learning_rate": 4.75338078162749e-05, + "loss": 3.121, + "step": 476000 + }, + { + "epoch": 0.1481269633039929, + "grad_norm": 6.951367378234863, + "learning_rate": 4.7531217278266786e-05, + "loss": 3.1275, + "step": 476500 + }, + { + "epoch": 0.14828239558447978, + "grad_norm": 8.619511604309082, + "learning_rate": 4.752862674025867e-05, + "loss": 3.0967, + "step": 477000 + }, + { + "epoch": 0.14843782786496665, + "grad_norm": 7.940844535827637, + "learning_rate": 4.752603620225056e-05, + "loss": 3.1215, + "step": 477500 + }, + { + "epoch": 0.14859326014545352, + "grad_norm": 7.713293075561523, + "learning_rate": 4.752344566424245e-05, + "loss": 3.1883, + "step": 478000 + }, + { + "epoch": 0.1487486924259404, + "grad_norm": 11.740067481994629, + "learning_rate": 4.7520855126234334e-05, + "loss": 3.1687, + "step": 478500 + }, + { + "epoch": 0.14890412470642728, + "grad_norm": 13.193681716918945, + "learning_rate": 4.7518264588226215e-05, + "loss": 3.1246, + "step": 479000 + }, + { + "epoch": 0.14905955698691414, + "grad_norm": 8.217891693115234, + "learning_rate": 4.75156740502181e-05, + "loss": 3.1426, + "step": 479500 + }, + { + "epoch": 0.14921498926740104, + "grad_norm": 15.731176376342773, + "learning_rate": 4.751308351220998e-05, + "loss": 3.1154, + "step": 480000 + }, + { + "epoch": 0.1493704215478879, + "grad_norm": 9.531112670898438, + "learning_rate": 4.751049297420187e-05, + "loss": 3.0991, + "step": 480500 + }, + { + "epoch": 0.14952585382837477, + "grad_norm": 9.721943855285645, + "learning_rate": 4.750790243619376e-05, + "loss": 3.0793, + "step": 481000 + }, + { + "epoch": 0.14968128610886167, + "grad_norm": 8.553193092346191, + "learning_rate": 4.750531189818564e-05, + "loss": 3.158, + "step": 481500 + }, + { + "epoch": 0.14983671838934853, + "grad_norm": 10.555859565734863, + "learning_rate": 4.7502721360177524e-05, + "loss": 3.0712, + "step": 482000 + }, + { + "epoch": 0.1499921506698354, + "grad_norm": 9.589221954345703, + "learning_rate": 4.750013082216941e-05, + "loss": 3.1489, + "step": 482500 + }, + { + "epoch": 0.1501475829503223, + "grad_norm": 13.968549728393555, + "learning_rate": 4.74975402841613e-05, + "loss": 3.1575, + "step": 483000 + }, + { + "epoch": 0.15030301523080916, + "grad_norm": 10.576818466186523, + "learning_rate": 4.7494949746153186e-05, + "loss": 3.1299, + "step": 483500 + }, + { + "epoch": 0.15045844751129603, + "grad_norm": 8.18901252746582, + "learning_rate": 4.749235920814507e-05, + "loss": 3.0897, + "step": 484000 + }, + { + "epoch": 0.15061387979178292, + "grad_norm": 12.111784934997559, + "learning_rate": 4.748976867013695e-05, + "loss": 3.1082, + "step": 484500 + }, + { + "epoch": 0.1507693120722698, + "grad_norm": 7.003484725952148, + "learning_rate": 4.748717813212884e-05, + "loss": 3.1569, + "step": 485000 + }, + { + "epoch": 0.15092474435275666, + "grad_norm": 8.379434585571289, + "learning_rate": 4.748458759412072e-05, + "loss": 3.1423, + "step": 485500 + }, + { + "epoch": 0.15108017663324355, + "grad_norm": 9.115384101867676, + "learning_rate": 4.748199705611261e-05, + "loss": 3.1081, + "step": 486000 + }, + { + "epoch": 0.15123560891373042, + "grad_norm": 13.299870491027832, + "learning_rate": 4.7479406518104495e-05, + "loss": 3.1484, + "step": 486500 + }, + { + "epoch": 0.15139104119421729, + "grad_norm": 8.720069885253906, + "learning_rate": 4.7476815980096375e-05, + "loss": 3.1213, + "step": 487000 + }, + { + "epoch": 0.15154647347470418, + "grad_norm": 10.33544635772705, + "learning_rate": 4.747422544208827e-05, + "loss": 3.1419, + "step": 487500 + }, + { + "epoch": 0.15170190575519105, + "grad_norm": 7.469802379608154, + "learning_rate": 4.7471634904080156e-05, + "loss": 3.155, + "step": 488000 + }, + { + "epoch": 0.1518573380356779, + "grad_norm": 15.078301429748535, + "learning_rate": 4.746904436607204e-05, + "loss": 3.117, + "step": 488500 + }, + { + "epoch": 0.1520127703161648, + "grad_norm": 7.588748455047607, + "learning_rate": 4.7466453828063924e-05, + "loss": 3.1147, + "step": 489000 + }, + { + "epoch": 0.15216820259665167, + "grad_norm": 7.8488240242004395, + "learning_rate": 4.746386329005581e-05, + "loss": 3.1899, + "step": 489500 + }, + { + "epoch": 0.15232363487713854, + "grad_norm": 7.463630676269531, + "learning_rate": 4.746127275204769e-05, + "loss": 3.1639, + "step": 490000 + }, + { + "epoch": 0.15247906715762544, + "grad_norm": 7.649951457977295, + "learning_rate": 4.745868221403958e-05, + "loss": 3.1396, + "step": 490500 + }, + { + "epoch": 0.1526344994381123, + "grad_norm": 6.439744472503662, + "learning_rate": 4.745609167603146e-05, + "loss": 3.1164, + "step": 491000 + }, + { + "epoch": 0.15278993171859917, + "grad_norm": 11.285918235778809, + "learning_rate": 4.7453501138023346e-05, + "loss": 3.1563, + "step": 491500 + }, + { + "epoch": 0.15294536399908606, + "grad_norm": 7.48683500289917, + "learning_rate": 4.745091060001523e-05, + "loss": 3.1414, + "step": 492000 + }, + { + "epoch": 0.15310079627957293, + "grad_norm": 6.754961967468262, + "learning_rate": 4.744832006200712e-05, + "loss": 3.1522, + "step": 492500 + }, + { + "epoch": 0.1532562285600598, + "grad_norm": 8.791010856628418, + "learning_rate": 4.744572952399901e-05, + "loss": 3.1471, + "step": 493000 + }, + { + "epoch": 0.1534116608405467, + "grad_norm": 8.639376640319824, + "learning_rate": 4.7443138985990895e-05, + "loss": 3.14, + "step": 493500 + }, + { + "epoch": 0.15356709312103356, + "grad_norm": 14.548735618591309, + "learning_rate": 4.7440548447982775e-05, + "loss": 3.1338, + "step": 494000 + }, + { + "epoch": 0.15372252540152043, + "grad_norm": 5.292453289031982, + "learning_rate": 4.743795790997466e-05, + "loss": 3.1256, + "step": 494500 + }, + { + "epoch": 0.15387795768200732, + "grad_norm": 9.916193962097168, + "learning_rate": 4.743536737196655e-05, + "loss": 3.0866, + "step": 495000 + }, + { + "epoch": 0.1540333899624942, + "grad_norm": 6.941450119018555, + "learning_rate": 4.743277683395843e-05, + "loss": 3.187, + "step": 495500 + }, + { + "epoch": 0.15418882224298106, + "grad_norm": 8.55876350402832, + "learning_rate": 4.743018629595032e-05, + "loss": 3.0956, + "step": 496000 + }, + { + "epoch": 0.15434425452346795, + "grad_norm": 8.726902961730957, + "learning_rate": 4.7427595757942204e-05, + "loss": 3.1401, + "step": 496500 + }, + { + "epoch": 0.15449968680395482, + "grad_norm": 7.2924580574035645, + "learning_rate": 4.7425005219934085e-05, + "loss": 3.1031, + "step": 497000 + }, + { + "epoch": 0.15465511908444168, + "grad_norm": 8.910125732421875, + "learning_rate": 4.742241468192598e-05, + "loss": 3.073, + "step": 497500 + }, + { + "epoch": 0.15481055136492858, + "grad_norm": 18.382535934448242, + "learning_rate": 4.741982414391786e-05, + "loss": 3.1011, + "step": 498000 + }, + { + "epoch": 0.15496598364541544, + "grad_norm": 7.255028247833252, + "learning_rate": 4.7417233605909746e-05, + "loss": 3.0839, + "step": 498500 + }, + { + "epoch": 0.1551214159259023, + "grad_norm": 17.645824432373047, + "learning_rate": 4.741464306790163e-05, + "loss": 3.1512, + "step": 499000 + }, + { + "epoch": 0.1552768482063892, + "grad_norm": 7.915125846862793, + "learning_rate": 4.7412052529893514e-05, + "loss": 3.1445, + "step": 499500 + }, + { + "epoch": 0.15543228048687607, + "grad_norm": 9.787822723388672, + "learning_rate": 4.74094619918854e-05, + "loss": 3.135, + "step": 500000 + }, + { + "epoch": 0.15558771276736294, + "grad_norm": 6.089315891265869, + "learning_rate": 4.740687145387729e-05, + "loss": 3.1201, + "step": 500500 + }, + { + "epoch": 0.15574314504784983, + "grad_norm": 7.732711315155029, + "learning_rate": 4.740428091586917e-05, + "loss": 3.1178, + "step": 501000 + }, + { + "epoch": 0.1558985773283367, + "grad_norm": 7.962092399597168, + "learning_rate": 4.7401690377861055e-05, + "loss": 3.124, + "step": 501500 + }, + { + "epoch": 0.15605400960882357, + "grad_norm": 7.324747085571289, + "learning_rate": 4.739909983985294e-05, + "loss": 3.1248, + "step": 502000 + }, + { + "epoch": 0.15620944188931046, + "grad_norm": 12.958382606506348, + "learning_rate": 4.739650930184483e-05, + "loss": 3.1244, + "step": 502500 + }, + { + "epoch": 0.15636487416979733, + "grad_norm": 8.758108139038086, + "learning_rate": 4.739391876383672e-05, + "loss": 3.1077, + "step": 503000 + }, + { + "epoch": 0.1565203064502842, + "grad_norm": 7.021946430206299, + "learning_rate": 4.73913282258286e-05, + "loss": 3.1318, + "step": 503500 + }, + { + "epoch": 0.1566757387307711, + "grad_norm": 22.6928653717041, + "learning_rate": 4.7388737687820484e-05, + "loss": 3.1027, + "step": 504000 + }, + { + "epoch": 0.15683117101125796, + "grad_norm": 7.140137672424316, + "learning_rate": 4.738614714981237e-05, + "loss": 3.1527, + "step": 504500 + }, + { + "epoch": 0.15698660329174483, + "grad_norm": 8.573493957519531, + "learning_rate": 4.738355661180425e-05, + "loss": 3.1023, + "step": 505000 + }, + { + "epoch": 0.15714203557223172, + "grad_norm": 8.161904335021973, + "learning_rate": 4.738096607379614e-05, + "loss": 3.1013, + "step": 505500 + }, + { + "epoch": 0.1572974678527186, + "grad_norm": 7.467336177825928, + "learning_rate": 4.7378375535788026e-05, + "loss": 3.1351, + "step": 506000 + }, + { + "epoch": 0.15745290013320545, + "grad_norm": 9.063158988952637, + "learning_rate": 4.7375784997779907e-05, + "loss": 3.1551, + "step": 506500 + }, + { + "epoch": 0.15760833241369235, + "grad_norm": 5.765160083770752, + "learning_rate": 4.7373194459771794e-05, + "loss": 3.1307, + "step": 507000 + }, + { + "epoch": 0.15776376469417921, + "grad_norm": 6.123049259185791, + "learning_rate": 4.737060392176369e-05, + "loss": 3.107, + "step": 507500 + }, + { + "epoch": 0.15791919697466608, + "grad_norm": 7.310281753540039, + "learning_rate": 4.736801338375557e-05, + "loss": 3.1536, + "step": 508000 + }, + { + "epoch": 0.15807462925515298, + "grad_norm": 8.456357955932617, + "learning_rate": 4.7365422845747455e-05, + "loss": 3.1204, + "step": 508500 + }, + { + "epoch": 0.15823006153563984, + "grad_norm": 7.946829795837402, + "learning_rate": 4.7362832307739336e-05, + "loss": 3.118, + "step": 509000 + }, + { + "epoch": 0.1583854938161267, + "grad_norm": 10.249110221862793, + "learning_rate": 4.736024176973122e-05, + "loss": 3.1363, + "step": 509500 + }, + { + "epoch": 0.1585409260966136, + "grad_norm": 7.275257587432861, + "learning_rate": 4.735765123172311e-05, + "loss": 3.12, + "step": 510000 + }, + { + "epoch": 0.15869635837710047, + "grad_norm": 10.096506118774414, + "learning_rate": 4.735506069371499e-05, + "loss": 3.0747, + "step": 510500 + }, + { + "epoch": 0.15885179065758734, + "grad_norm": 7.495107650756836, + "learning_rate": 4.735247015570688e-05, + "loss": 3.1503, + "step": 511000 + }, + { + "epoch": 0.15900722293807423, + "grad_norm": 7.7461018562316895, + "learning_rate": 4.7349879617698765e-05, + "loss": 3.1203, + "step": 511500 + }, + { + "epoch": 0.1591626552185611, + "grad_norm": 6.908226013183594, + "learning_rate": 4.734728907969065e-05, + "loss": 3.1471, + "step": 512000 + }, + { + "epoch": 0.15931808749904797, + "grad_norm": 38.44561004638672, + "learning_rate": 4.734469854168254e-05, + "loss": 3.0984, + "step": 512500 + }, + { + "epoch": 0.15947351977953486, + "grad_norm": 7.0903544425964355, + "learning_rate": 4.7342108003674426e-05, + "loss": 3.1264, + "step": 513000 + }, + { + "epoch": 0.15962895206002173, + "grad_norm": 7.831568241119385, + "learning_rate": 4.7339517465666306e-05, + "loss": 3.1493, + "step": 513500 + }, + { + "epoch": 0.1597843843405086, + "grad_norm": 8.5963134765625, + "learning_rate": 4.7336926927658194e-05, + "loss": 3.1003, + "step": 514000 + }, + { + "epoch": 0.1599398166209955, + "grad_norm": 11.267167091369629, + "learning_rate": 4.733433638965008e-05, + "loss": 3.1026, + "step": 514500 + }, + { + "epoch": 0.16009524890148236, + "grad_norm": 12.082208633422852, + "learning_rate": 4.733174585164196e-05, + "loss": 3.1095, + "step": 515000 + }, + { + "epoch": 0.16025068118196922, + "grad_norm": 9.97931957244873, + "learning_rate": 4.732915531363385e-05, + "loss": 3.1522, + "step": 515500 + }, + { + "epoch": 0.16040611346245612, + "grad_norm": 8.864288330078125, + "learning_rate": 4.732656477562573e-05, + "loss": 3.1543, + "step": 516000 + }, + { + "epoch": 0.16056154574294298, + "grad_norm": 6.7947998046875, + "learning_rate": 4.7323974237617616e-05, + "loss": 3.1712, + "step": 516500 + }, + { + "epoch": 0.16071697802342985, + "grad_norm": 7.627294063568115, + "learning_rate": 4.73213836996095e-05, + "loss": 3.1423, + "step": 517000 + }, + { + "epoch": 0.16087241030391675, + "grad_norm": 8.899734497070312, + "learning_rate": 4.731879316160139e-05, + "loss": 3.1577, + "step": 517500 + }, + { + "epoch": 0.1610278425844036, + "grad_norm": 9.609889030456543, + "learning_rate": 4.731620262359328e-05, + "loss": 3.1199, + "step": 518000 + }, + { + "epoch": 0.16118327486489048, + "grad_norm": 6.127584934234619, + "learning_rate": 4.7313612085585164e-05, + "loss": 3.2014, + "step": 518500 + }, + { + "epoch": 0.16133870714537737, + "grad_norm": 12.97293472290039, + "learning_rate": 4.7311021547577045e-05, + "loss": 3.1367, + "step": 519000 + }, + { + "epoch": 0.16149413942586424, + "grad_norm": 7.39502477645874, + "learning_rate": 4.730843100956893e-05, + "loss": 3.1716, + "step": 519500 + }, + { + "epoch": 0.1616495717063511, + "grad_norm": 7.809886932373047, + "learning_rate": 4.730584047156082e-05, + "loss": 3.1363, + "step": 520000 + }, + { + "epoch": 0.161805003986838, + "grad_norm": 7.567405700683594, + "learning_rate": 4.73032499335527e-05, + "loss": 3.1473, + "step": 520500 + }, + { + "epoch": 0.16196043626732487, + "grad_norm": 8.093345642089844, + "learning_rate": 4.7300659395544587e-05, + "loss": 3.11, + "step": 521000 + }, + { + "epoch": 0.16211586854781174, + "grad_norm": 5.077154636383057, + "learning_rate": 4.7298068857536474e-05, + "loss": 3.1825, + "step": 521500 + }, + { + "epoch": 0.16227130082829863, + "grad_norm": 8.277685165405273, + "learning_rate": 4.729547831952836e-05, + "loss": 3.1163, + "step": 522000 + }, + { + "epoch": 0.1624267331087855, + "grad_norm": 6.650649547576904, + "learning_rate": 4.729288778152025e-05, + "loss": 3.1416, + "step": 522500 + }, + { + "epoch": 0.16258216538927237, + "grad_norm": 8.062630653381348, + "learning_rate": 4.729029724351213e-05, + "loss": 3.1474, + "step": 523000 + }, + { + "epoch": 0.16273759766975926, + "grad_norm": 8.593581199645996, + "learning_rate": 4.7287706705504016e-05, + "loss": 3.1184, + "step": 523500 + }, + { + "epoch": 0.16289302995024613, + "grad_norm": 8.458962440490723, + "learning_rate": 4.72851161674959e-05, + "loss": 3.125, + "step": 524000 + }, + { + "epoch": 0.163048462230733, + "grad_norm": 8.705788612365723, + "learning_rate": 4.728252562948778e-05, + "loss": 3.1748, + "step": 524500 + }, + { + "epoch": 0.1632038945112199, + "grad_norm": 9.431114196777344, + "learning_rate": 4.727993509147967e-05, + "loss": 3.1748, + "step": 525000 + }, + { + "epoch": 0.16335932679170675, + "grad_norm": 21.071565628051758, + "learning_rate": 4.727734455347156e-05, + "loss": 3.132, + "step": 525500 + }, + { + "epoch": 0.16351475907219362, + "grad_norm": 10.906435012817383, + "learning_rate": 4.727475401546344e-05, + "loss": 3.1191, + "step": 526000 + }, + { + "epoch": 0.16367019135268052, + "grad_norm": 7.357880115509033, + "learning_rate": 4.7272163477455325e-05, + "loss": 3.1003, + "step": 526500 + }, + { + "epoch": 0.16382562363316738, + "grad_norm": 8.207454681396484, + "learning_rate": 4.726957293944721e-05, + "loss": 3.101, + "step": 527000 + }, + { + "epoch": 0.16398105591365425, + "grad_norm": 10.820603370666504, + "learning_rate": 4.72669824014391e-05, + "loss": 3.1489, + "step": 527500 + }, + { + "epoch": 0.16413648819414114, + "grad_norm": 17.1276912689209, + "learning_rate": 4.7264391863430986e-05, + "loss": 3.1258, + "step": 528000 + }, + { + "epoch": 0.164291920474628, + "grad_norm": 6.727423191070557, + "learning_rate": 4.726180132542287e-05, + "loss": 3.1276, + "step": 528500 + }, + { + "epoch": 0.16444735275511488, + "grad_norm": 8.234780311584473, + "learning_rate": 4.7259210787414754e-05, + "loss": 3.1077, + "step": 529000 + }, + { + "epoch": 0.16460278503560177, + "grad_norm": 6.179680824279785, + "learning_rate": 4.725662024940664e-05, + "loss": 3.1286, + "step": 529500 + }, + { + "epoch": 0.16475821731608864, + "grad_norm": 6.2046356201171875, + "learning_rate": 4.725402971139852e-05, + "loss": 3.1007, + "step": 530000 + }, + { + "epoch": 0.1649136495965755, + "grad_norm": 6.106831073760986, + "learning_rate": 4.725143917339041e-05, + "loss": 3.0742, + "step": 530500 + }, + { + "epoch": 0.1650690818770624, + "grad_norm": 8.481727600097656, + "learning_rate": 4.7248848635382296e-05, + "loss": 3.1411, + "step": 531000 + }, + { + "epoch": 0.16522451415754927, + "grad_norm": 8.26651382446289, + "learning_rate": 4.724625809737418e-05, + "loss": 3.1046, + "step": 531500 + }, + { + "epoch": 0.16537994643803614, + "grad_norm": 8.52824592590332, + "learning_rate": 4.724366755936607e-05, + "loss": 3.0808, + "step": 532000 + }, + { + "epoch": 0.16553537871852303, + "grad_norm": 8.621288299560547, + "learning_rate": 4.724107702135796e-05, + "loss": 3.1559, + "step": 532500 + }, + { + "epoch": 0.1656908109990099, + "grad_norm": 7.1093950271606445, + "learning_rate": 4.723848648334984e-05, + "loss": 3.1219, + "step": 533000 + }, + { + "epoch": 0.16584624327949676, + "grad_norm": 7.698925495147705, + "learning_rate": 4.7235895945341725e-05, + "loss": 3.0727, + "step": 533500 + }, + { + "epoch": 0.16600167555998366, + "grad_norm": 5.589407920837402, + "learning_rate": 4.7233305407333605e-05, + "loss": 3.1459, + "step": 534000 + }, + { + "epoch": 0.16615710784047052, + "grad_norm": 12.173215866088867, + "learning_rate": 4.723071486932549e-05, + "loss": 3.0938, + "step": 534500 + }, + { + "epoch": 0.1663125401209574, + "grad_norm": 8.944339752197266, + "learning_rate": 4.722812433131738e-05, + "loss": 3.1596, + "step": 535000 + }, + { + "epoch": 0.1664679724014443, + "grad_norm": 8.729195594787598, + "learning_rate": 4.722553379330926e-05, + "loss": 3.1338, + "step": 535500 + }, + { + "epoch": 0.16662340468193115, + "grad_norm": 6.123769283294678, + "learning_rate": 4.722294325530115e-05, + "loss": 3.1354, + "step": 536000 + }, + { + "epoch": 0.16677883696241802, + "grad_norm": 7.131565570831299, + "learning_rate": 4.7220352717293034e-05, + "loss": 3.0898, + "step": 536500 + }, + { + "epoch": 0.16693426924290491, + "grad_norm": 8.59522819519043, + "learning_rate": 4.721776217928492e-05, + "loss": 3.1154, + "step": 537000 + }, + { + "epoch": 0.16708970152339178, + "grad_norm": 7.426102638244629, + "learning_rate": 4.721517164127681e-05, + "loss": 3.1064, + "step": 537500 + }, + { + "epoch": 0.16724513380387865, + "grad_norm": 8.341367721557617, + "learning_rate": 4.7212581103268696e-05, + "loss": 3.117, + "step": 538000 + }, + { + "epoch": 0.16740056608436554, + "grad_norm": 6.746263027191162, + "learning_rate": 4.7209990565260576e-05, + "loss": 3.1397, + "step": 538500 + }, + { + "epoch": 0.1675559983648524, + "grad_norm": 6.4972453117370605, + "learning_rate": 4.720740002725246e-05, + "loss": 3.1474, + "step": 539000 + }, + { + "epoch": 0.16771143064533928, + "grad_norm": 8.854972839355469, + "learning_rate": 4.7204809489244344e-05, + "loss": 3.1303, + "step": 539500 + }, + { + "epoch": 0.16786686292582617, + "grad_norm": 6.571467876434326, + "learning_rate": 4.720221895123623e-05, + "loss": 3.115, + "step": 540000 + }, + { + "epoch": 0.16802229520631304, + "grad_norm": 14.807621955871582, + "learning_rate": 4.719962841322812e-05, + "loss": 3.118, + "step": 540500 + }, + { + "epoch": 0.1681777274867999, + "grad_norm": 7.903375148773193, + "learning_rate": 4.719703787522e-05, + "loss": 3.1481, + "step": 541000 + }, + { + "epoch": 0.1683331597672868, + "grad_norm": 12.843640327453613, + "learning_rate": 4.719444733721189e-05, + "loss": 3.1561, + "step": 541500 + }, + { + "epoch": 0.16848859204777367, + "grad_norm": 8.120024681091309, + "learning_rate": 4.719185679920378e-05, + "loss": 3.1379, + "step": 542000 + }, + { + "epoch": 0.16864402432826053, + "grad_norm": 8.4661865234375, + "learning_rate": 4.718926626119566e-05, + "loss": 3.1002, + "step": 542500 + }, + { + "epoch": 0.16879945660874743, + "grad_norm": 6.850857734680176, + "learning_rate": 4.718667572318755e-05, + "loss": 3.1022, + "step": 543000 + }, + { + "epoch": 0.1689548888892343, + "grad_norm": 11.76528263092041, + "learning_rate": 4.7184085185179434e-05, + "loss": 3.1454, + "step": 543500 + }, + { + "epoch": 0.16911032116972116, + "grad_norm": 19.232179641723633, + "learning_rate": 4.7181494647171314e-05, + "loss": 3.1522, + "step": 544000 + }, + { + "epoch": 0.16926575345020806, + "grad_norm": 13.275445938110352, + "learning_rate": 4.71789041091632e-05, + "loss": 3.1258, + "step": 544500 + }, + { + "epoch": 0.16942118573069492, + "grad_norm": 7.983952045440674, + "learning_rate": 4.717631357115509e-05, + "loss": 3.1507, + "step": 545000 + }, + { + "epoch": 0.1695766180111818, + "grad_norm": 7.790310382843018, + "learning_rate": 4.717372303314697e-05, + "loss": 3.1403, + "step": 545500 + }, + { + "epoch": 0.16973205029166868, + "grad_norm": 8.058002471923828, + "learning_rate": 4.7171132495138856e-05, + "loss": 3.1469, + "step": 546000 + }, + { + "epoch": 0.16988748257215555, + "grad_norm": 9.657066345214844, + "learning_rate": 4.716854195713074e-05, + "loss": 3.0971, + "step": 546500 + }, + { + "epoch": 0.17004291485264242, + "grad_norm": 7.3687567710876465, + "learning_rate": 4.716595141912263e-05, + "loss": 3.1393, + "step": 547000 + }, + { + "epoch": 0.1701983471331293, + "grad_norm": 7.705102920532227, + "learning_rate": 4.716336088111452e-05, + "loss": 3.1228, + "step": 547500 + }, + { + "epoch": 0.17035377941361618, + "grad_norm": 9.433982849121094, + "learning_rate": 4.71607703431064e-05, + "loss": 3.1379, + "step": 548000 + }, + { + "epoch": 0.17050921169410305, + "grad_norm": 7.570931434631348, + "learning_rate": 4.7158179805098285e-05, + "loss": 3.1946, + "step": 548500 + }, + { + "epoch": 0.17066464397458994, + "grad_norm": 9.69499683380127, + "learning_rate": 4.715558926709017e-05, + "loss": 3.0882, + "step": 549000 + }, + { + "epoch": 0.1708200762550768, + "grad_norm": 9.033105850219727, + "learning_rate": 4.715299872908205e-05, + "loss": 3.1178, + "step": 549500 + }, + { + "epoch": 0.17097550853556368, + "grad_norm": 9.955949783325195, + "learning_rate": 4.715040819107394e-05, + "loss": 3.1025, + "step": 550000 + }, + { + "epoch": 0.17113094081605057, + "grad_norm": 28.433761596679688, + "learning_rate": 4.714781765306583e-05, + "loss": 3.1286, + "step": 550500 + }, + { + "epoch": 0.17128637309653744, + "grad_norm": 10.342140197753906, + "learning_rate": 4.714522711505771e-05, + "loss": 3.193, + "step": 551000 + }, + { + "epoch": 0.1714418053770243, + "grad_norm": 8.3802490234375, + "learning_rate": 4.71426365770496e-05, + "loss": 3.1298, + "step": 551500 + }, + { + "epoch": 0.1715972376575112, + "grad_norm": 9.201905250549316, + "learning_rate": 4.714004603904148e-05, + "loss": 3.1118, + "step": 552000 + }, + { + "epoch": 0.17175266993799806, + "grad_norm": 7.247037410736084, + "learning_rate": 4.713745550103337e-05, + "loss": 3.1398, + "step": 552500 + }, + { + "epoch": 0.17190810221848493, + "grad_norm": 8.779488563537598, + "learning_rate": 4.7134864963025256e-05, + "loss": 3.1248, + "step": 553000 + }, + { + "epoch": 0.17206353449897183, + "grad_norm": 8.407361030578613, + "learning_rate": 4.7132274425017136e-05, + "loss": 3.0791, + "step": 553500 + }, + { + "epoch": 0.1722189667794587, + "grad_norm": 7.888935565948486, + "learning_rate": 4.7129683887009024e-05, + "loss": 3.1687, + "step": 554000 + }, + { + "epoch": 0.17237439905994556, + "grad_norm": 7.915614128112793, + "learning_rate": 4.712709334900091e-05, + "loss": 3.1791, + "step": 554500 + }, + { + "epoch": 0.17252983134043245, + "grad_norm": 7.7378764152526855, + "learning_rate": 4.712450281099279e-05, + "loss": 3.1802, + "step": 555000 + }, + { + "epoch": 0.17268526362091932, + "grad_norm": 8.014586448669434, + "learning_rate": 4.712191227298468e-05, + "loss": 3.1176, + "step": 555500 + }, + { + "epoch": 0.1728406959014062, + "grad_norm": 8.049857139587402, + "learning_rate": 4.7119321734976565e-05, + "loss": 3.0811, + "step": 556000 + }, + { + "epoch": 0.17299612818189308, + "grad_norm": 8.681418418884277, + "learning_rate": 4.711673119696845e-05, + "loss": 3.1239, + "step": 556500 + }, + { + "epoch": 0.17315156046237995, + "grad_norm": 13.479007720947266, + "learning_rate": 4.711414065896034e-05, + "loss": 3.0974, + "step": 557000 + }, + { + "epoch": 0.17330699274286682, + "grad_norm": 7.688525676727295, + "learning_rate": 4.711155012095222e-05, + "loss": 3.0926, + "step": 557500 + }, + { + "epoch": 0.1734624250233537, + "grad_norm": 18.910005569458008, + "learning_rate": 4.710895958294411e-05, + "loss": 3.1075, + "step": 558000 + }, + { + "epoch": 0.17361785730384058, + "grad_norm": 6.72324800491333, + "learning_rate": 4.7106369044935994e-05, + "loss": 3.1323, + "step": 558500 + }, + { + "epoch": 0.17377328958432744, + "grad_norm": 12.056007385253906, + "learning_rate": 4.7103778506927875e-05, + "loss": 3.1129, + "step": 559000 + }, + { + "epoch": 0.17392872186481434, + "grad_norm": 7.566714763641357, + "learning_rate": 4.710118796891976e-05, + "loss": 3.1163, + "step": 559500 + }, + { + "epoch": 0.1740841541453012, + "grad_norm": 7.930150508880615, + "learning_rate": 4.709859743091165e-05, + "loss": 3.1116, + "step": 560000 + }, + { + "epoch": 0.17423958642578807, + "grad_norm": 13.858633995056152, + "learning_rate": 4.709600689290353e-05, + "loss": 3.124, + "step": 560500 + }, + { + "epoch": 0.17439501870627497, + "grad_norm": 8.884865760803223, + "learning_rate": 4.7093416354895417e-05, + "loss": 3.1083, + "step": 561000 + }, + { + "epoch": 0.17455045098676183, + "grad_norm": 7.241312503814697, + "learning_rate": 4.709082581688731e-05, + "loss": 3.1127, + "step": 561500 + }, + { + "epoch": 0.1747058832672487, + "grad_norm": 7.3461456298828125, + "learning_rate": 4.708823527887919e-05, + "loss": 3.2157, + "step": 562000 + }, + { + "epoch": 0.1748613155477356, + "grad_norm": 9.360142707824707, + "learning_rate": 4.708564474087108e-05, + "loss": 3.1214, + "step": 562500 + }, + { + "epoch": 0.17501674782822246, + "grad_norm": 7.096540451049805, + "learning_rate": 4.7083054202862965e-05, + "loss": 3.1495, + "step": 563000 + }, + { + "epoch": 0.17517218010870933, + "grad_norm": 9.06702709197998, + "learning_rate": 4.7080463664854846e-05, + "loss": 3.1276, + "step": 563500 + }, + { + "epoch": 0.17532761238919622, + "grad_norm": 8.80846881866455, + "learning_rate": 4.707787312684673e-05, + "loss": 3.1402, + "step": 564000 + }, + { + "epoch": 0.1754830446696831, + "grad_norm": 7.6970744132995605, + "learning_rate": 4.707528258883861e-05, + "loss": 3.1006, + "step": 564500 + }, + { + "epoch": 0.17563847695016996, + "grad_norm": 10.562844276428223, + "learning_rate": 4.70726920508305e-05, + "loss": 3.0841, + "step": 565000 + }, + { + "epoch": 0.17579390923065685, + "grad_norm": 9.86268424987793, + "learning_rate": 4.707010151282239e-05, + "loss": 3.1053, + "step": 565500 + }, + { + "epoch": 0.17594934151114372, + "grad_norm": 7.686104774475098, + "learning_rate": 4.7067510974814275e-05, + "loss": 3.1495, + "step": 566000 + }, + { + "epoch": 0.1761047737916306, + "grad_norm": 7.766209602355957, + "learning_rate": 4.706492043680616e-05, + "loss": 3.1298, + "step": 566500 + }, + { + "epoch": 0.17626020607211748, + "grad_norm": 8.606474876403809, + "learning_rate": 4.706232989879805e-05, + "loss": 3.116, + "step": 567000 + }, + { + "epoch": 0.17641563835260435, + "grad_norm": 7.518901824951172, + "learning_rate": 4.705973936078993e-05, + "loss": 3.136, + "step": 567500 + }, + { + "epoch": 0.17657107063309121, + "grad_norm": 9.478394508361816, + "learning_rate": 4.7057148822781816e-05, + "loss": 3.1331, + "step": 568000 + }, + { + "epoch": 0.1767265029135781, + "grad_norm": 7.522947788238525, + "learning_rate": 4.7054558284773704e-05, + "loss": 3.133, + "step": 568500 + }, + { + "epoch": 0.17688193519406498, + "grad_norm": 9.82868766784668, + "learning_rate": 4.7051967746765584e-05, + "loss": 3.1298, + "step": 569000 + }, + { + "epoch": 0.17703736747455184, + "grad_norm": 11.214179039001465, + "learning_rate": 4.704937720875747e-05, + "loss": 3.0579, + "step": 569500 + }, + { + "epoch": 0.17719279975503874, + "grad_norm": 8.263320922851562, + "learning_rate": 4.704678667074935e-05, + "loss": 3.1257, + "step": 570000 + }, + { + "epoch": 0.1773482320355256, + "grad_norm": 10.637370109558105, + "learning_rate": 4.704419613274124e-05, + "loss": 3.1275, + "step": 570500 + }, + { + "epoch": 0.17750366431601247, + "grad_norm": 11.592597961425781, + "learning_rate": 4.7041605594733126e-05, + "loss": 3.1146, + "step": 571000 + }, + { + "epoch": 0.17765909659649937, + "grad_norm": 6.318106174468994, + "learning_rate": 4.703901505672501e-05, + "loss": 3.1072, + "step": 571500 + }, + { + "epoch": 0.17781452887698623, + "grad_norm": 5.980849266052246, + "learning_rate": 4.70364245187169e-05, + "loss": 3.1208, + "step": 572000 + }, + { + "epoch": 0.1779699611574731, + "grad_norm": 13.741256713867188, + "learning_rate": 4.703383398070879e-05, + "loss": 3.1294, + "step": 572500 + }, + { + "epoch": 0.17812539343796, + "grad_norm": 13.065375328063965, + "learning_rate": 4.703124344270067e-05, + "loss": 3.0792, + "step": 573000 + }, + { + "epoch": 0.17828082571844686, + "grad_norm": 12.534862518310547, + "learning_rate": 4.7028652904692555e-05, + "loss": 3.1069, + "step": 573500 + }, + { + "epoch": 0.17843625799893373, + "grad_norm": 9.196142196655273, + "learning_rate": 4.702606236668444e-05, + "loss": 3.0883, + "step": 574000 + }, + { + "epoch": 0.17859169027942062, + "grad_norm": 20.366926193237305, + "learning_rate": 4.702347182867632e-05, + "loss": 3.1156, + "step": 574500 + }, + { + "epoch": 0.1787471225599075, + "grad_norm": 7.230238437652588, + "learning_rate": 4.702088129066821e-05, + "loss": 3.1093, + "step": 575000 + }, + { + "epoch": 0.17890255484039436, + "grad_norm": 7.573201656341553, + "learning_rate": 4.7018290752660097e-05, + "loss": 3.1328, + "step": 575500 + }, + { + "epoch": 0.17905798712088125, + "grad_norm": 8.659793853759766, + "learning_rate": 4.7015700214651984e-05, + "loss": 3.117, + "step": 576000 + }, + { + "epoch": 0.17921341940136812, + "grad_norm": 6.318301677703857, + "learning_rate": 4.701310967664387e-05, + "loss": 3.0759, + "step": 576500 + }, + { + "epoch": 0.17936885168185498, + "grad_norm": 8.041107177734375, + "learning_rate": 4.701051913863575e-05, + "loss": 3.1552, + "step": 577000 + }, + { + "epoch": 0.17952428396234188, + "grad_norm": 6.955123424530029, + "learning_rate": 4.700792860062764e-05, + "loss": 3.0912, + "step": 577500 + }, + { + "epoch": 0.17967971624282875, + "grad_norm": 8.055022239685059, + "learning_rate": 4.7005338062619526e-05, + "loss": 3.1438, + "step": 578000 + }, + { + "epoch": 0.1798351485233156, + "grad_norm": 8.194991111755371, + "learning_rate": 4.7002747524611406e-05, + "loss": 3.0946, + "step": 578500 + }, + { + "epoch": 0.1799905808038025, + "grad_norm": 24.898630142211914, + "learning_rate": 4.700015698660329e-05, + "loss": 3.1474, + "step": 579000 + }, + { + "epoch": 0.18014601308428937, + "grad_norm": 7.242453575134277, + "learning_rate": 4.699756644859518e-05, + "loss": 3.1516, + "step": 579500 + }, + { + "epoch": 0.18030144536477624, + "grad_norm": 8.266813278198242, + "learning_rate": 4.699497591058706e-05, + "loss": 3.1104, + "step": 580000 + }, + { + "epoch": 0.18045687764526314, + "grad_norm": 10.43005084991455, + "learning_rate": 4.699238537257895e-05, + "loss": 3.1103, + "step": 580500 + }, + { + "epoch": 0.18061230992575, + "grad_norm": 6.324367046356201, + "learning_rate": 4.6989794834570835e-05, + "loss": 3.1279, + "step": 581000 + }, + { + "epoch": 0.18076774220623687, + "grad_norm": 9.593320846557617, + "learning_rate": 4.698720429656272e-05, + "loss": 3.0794, + "step": 581500 + }, + { + "epoch": 0.18092317448672376, + "grad_norm": 7.421764373779297, + "learning_rate": 4.698461375855461e-05, + "loss": 3.1258, + "step": 582000 + }, + { + "epoch": 0.18107860676721063, + "grad_norm": 22.134998321533203, + "learning_rate": 4.698202322054649e-05, + "loss": 3.123, + "step": 582500 + }, + { + "epoch": 0.1812340390476975, + "grad_norm": 7.766335487365723, + "learning_rate": 4.697943268253838e-05, + "loss": 3.1073, + "step": 583000 + }, + { + "epoch": 0.1813894713281844, + "grad_norm": 8.08018684387207, + "learning_rate": 4.6976842144530264e-05, + "loss": 3.093, + "step": 583500 + }, + { + "epoch": 0.18154490360867126, + "grad_norm": 9.001616477966309, + "learning_rate": 4.6974251606522144e-05, + "loss": 3.1485, + "step": 584000 + }, + { + "epoch": 0.18170033588915813, + "grad_norm": 15.216586112976074, + "learning_rate": 4.697166106851403e-05, + "loss": 3.1031, + "step": 584500 + }, + { + "epoch": 0.18185576816964502, + "grad_norm": 9.41524887084961, + "learning_rate": 4.696907053050592e-05, + "loss": 3.1642, + "step": 585000 + }, + { + "epoch": 0.1820112004501319, + "grad_norm": 7.727883338928223, + "learning_rate": 4.6966479992497806e-05, + "loss": 3.101, + "step": 585500 + }, + { + "epoch": 0.18216663273061875, + "grad_norm": 9.485734939575195, + "learning_rate": 4.696388945448969e-05, + "loss": 3.0855, + "step": 586000 + }, + { + "epoch": 0.18232206501110565, + "grad_norm": 7.91703987121582, + "learning_rate": 4.696129891648158e-05, + "loss": 3.1387, + "step": 586500 + }, + { + "epoch": 0.18247749729159252, + "grad_norm": 9.426841735839844, + "learning_rate": 4.695870837847346e-05, + "loss": 3.1486, + "step": 587000 + }, + { + "epoch": 0.18263292957207938, + "grad_norm": 9.23161506652832, + "learning_rate": 4.695611784046535e-05, + "loss": 3.1236, + "step": 587500 + }, + { + "epoch": 0.18278836185256628, + "grad_norm": 5.786877155303955, + "learning_rate": 4.695352730245723e-05, + "loss": 3.0659, + "step": 588000 + }, + { + "epoch": 0.18294379413305314, + "grad_norm": 5.137662410736084, + "learning_rate": 4.6950936764449115e-05, + "loss": 3.113, + "step": 588500 + }, + { + "epoch": 0.18309922641354, + "grad_norm": 5.256279468536377, + "learning_rate": 4.6948346226441e-05, + "loss": 3.145, + "step": 589000 + }, + { + "epoch": 0.1832546586940269, + "grad_norm": 11.975724220275879, + "learning_rate": 4.694575568843288e-05, + "loss": 3.126, + "step": 589500 + }, + { + "epoch": 0.18341009097451377, + "grad_norm": 9.235357284545898, + "learning_rate": 4.694316515042477e-05, + "loss": 3.1099, + "step": 590000 + }, + { + "epoch": 0.18356552325500064, + "grad_norm": 7.817656993865967, + "learning_rate": 4.694057461241666e-05, + "loss": 3.0987, + "step": 590500 + }, + { + "epoch": 0.18372095553548753, + "grad_norm": 6.5022735595703125, + "learning_rate": 4.6937984074408544e-05, + "loss": 3.0812, + "step": 591000 + }, + { + "epoch": 0.1838763878159744, + "grad_norm": 7.329495429992676, + "learning_rate": 4.693539353640043e-05, + "loss": 3.12, + "step": 591500 + }, + { + "epoch": 0.18403182009646127, + "grad_norm": 17.786266326904297, + "learning_rate": 4.693280299839232e-05, + "loss": 3.1359, + "step": 592000 + }, + { + "epoch": 0.18418725237694816, + "grad_norm": 9.662399291992188, + "learning_rate": 4.69302124603842e-05, + "loss": 3.1502, + "step": 592500 + }, + { + "epoch": 0.18434268465743503, + "grad_norm": 8.184389114379883, + "learning_rate": 4.6927621922376086e-05, + "loss": 3.1065, + "step": 593000 + }, + { + "epoch": 0.1844981169379219, + "grad_norm": 8.371453285217285, + "learning_rate": 4.6925031384367966e-05, + "loss": 3.1496, + "step": 593500 + }, + { + "epoch": 0.1846535492184088, + "grad_norm": 9.227255821228027, + "learning_rate": 4.6922440846359854e-05, + "loss": 3.1271, + "step": 594000 + }, + { + "epoch": 0.18480898149889566, + "grad_norm": 8.06972599029541, + "learning_rate": 4.691985030835174e-05, + "loss": 3.1223, + "step": 594500 + }, + { + "epoch": 0.18496441377938252, + "grad_norm": 8.0169038772583, + "learning_rate": 4.691725977034363e-05, + "loss": 3.1306, + "step": 595000 + }, + { + "epoch": 0.18511984605986942, + "grad_norm": 8.753162384033203, + "learning_rate": 4.6914669232335515e-05, + "loss": 3.1274, + "step": 595500 + }, + { + "epoch": 0.1852752783403563, + "grad_norm": 9.54090690612793, + "learning_rate": 4.69120786943274e-05, + "loss": 3.1798, + "step": 596000 + }, + { + "epoch": 0.18543071062084315, + "grad_norm": 7.036049842834473, + "learning_rate": 4.690948815631928e-05, + "loss": 3.1333, + "step": 596500 + }, + { + "epoch": 0.18558614290133005, + "grad_norm": 6.787283420562744, + "learning_rate": 4.690689761831117e-05, + "loss": 3.182, + "step": 597000 + }, + { + "epoch": 0.18574157518181691, + "grad_norm": 9.334185600280762, + "learning_rate": 4.690430708030306e-05, + "loss": 3.1267, + "step": 597500 + }, + { + "epoch": 0.18589700746230378, + "grad_norm": 10.186339378356934, + "learning_rate": 4.690171654229494e-05, + "loss": 3.0602, + "step": 598000 + }, + { + "epoch": 0.18605243974279065, + "grad_norm": 6.906607151031494, + "learning_rate": 4.6899126004286824e-05, + "loss": 3.0745, + "step": 598500 + }, + { + "epoch": 0.18620787202327754, + "grad_norm": 11.584217071533203, + "learning_rate": 4.689653546627871e-05, + "loss": 3.1052, + "step": 599000 + }, + { + "epoch": 0.1863633043037644, + "grad_norm": 8.42156982421875, + "learning_rate": 4.689394492827059e-05, + "loss": 3.1127, + "step": 599500 + }, + { + "epoch": 0.18651873658425128, + "grad_norm": 8.581707000732422, + "learning_rate": 4.689135439026248e-05, + "loss": 3.1319, + "step": 600000 + }, + { + "epoch": 0.18667416886473817, + "grad_norm": 10.01965045928955, + "learning_rate": 4.6888763852254366e-05, + "loss": 3.1419, + "step": 600500 + }, + { + "epoch": 0.18682960114522504, + "grad_norm": 7.474636077880859, + "learning_rate": 4.688617331424625e-05, + "loss": 3.084, + "step": 601000 + }, + { + "epoch": 0.1869850334257119, + "grad_norm": 8.288585662841797, + "learning_rate": 4.688358277623814e-05, + "loss": 3.142, + "step": 601500 + }, + { + "epoch": 0.1871404657061988, + "grad_norm": 5.315427303314209, + "learning_rate": 4.688099223823002e-05, + "loss": 3.0844, + "step": 602000 + }, + { + "epoch": 0.18729589798668567, + "grad_norm": 8.417303085327148, + "learning_rate": 4.687840170022191e-05, + "loss": 3.1128, + "step": 602500 + }, + { + "epoch": 0.18745133026717253, + "grad_norm": 7.091259002685547, + "learning_rate": 4.6875811162213795e-05, + "loss": 3.1041, + "step": 603000 + }, + { + "epoch": 0.18760676254765943, + "grad_norm": 11.023924827575684, + "learning_rate": 4.6873220624205676e-05, + "loss": 3.1085, + "step": 603500 + }, + { + "epoch": 0.1877621948281463, + "grad_norm": 19.397777557373047, + "learning_rate": 4.687063008619756e-05, + "loss": 3.1054, + "step": 604000 + }, + { + "epoch": 0.18791762710863316, + "grad_norm": 6.609205722808838, + "learning_rate": 4.686803954818945e-05, + "loss": 3.1292, + "step": 604500 + }, + { + "epoch": 0.18807305938912006, + "grad_norm": 11.93332290649414, + "learning_rate": 4.686544901018134e-05, + "loss": 3.092, + "step": 605000 + }, + { + "epoch": 0.18822849166960692, + "grad_norm": 9.277226448059082, + "learning_rate": 4.6862858472173224e-05, + "loss": 3.1102, + "step": 605500 + }, + { + "epoch": 0.1883839239500938, + "grad_norm": 6.372447490692139, + "learning_rate": 4.6860267934165105e-05, + "loss": 3.1027, + "step": 606000 + }, + { + "epoch": 0.18853935623058068, + "grad_norm": 8.540204048156738, + "learning_rate": 4.685767739615699e-05, + "loss": 3.0951, + "step": 606500 + }, + { + "epoch": 0.18869478851106755, + "grad_norm": 8.157196998596191, + "learning_rate": 4.685508685814888e-05, + "loss": 3.1143, + "step": 607000 + }, + { + "epoch": 0.18885022079155442, + "grad_norm": 5.317485809326172, + "learning_rate": 4.685249632014076e-05, + "loss": 3.1155, + "step": 607500 + }, + { + "epoch": 0.1890056530720413, + "grad_norm": 8.852375984191895, + "learning_rate": 4.6849905782132646e-05, + "loss": 3.1158, + "step": 608000 + }, + { + "epoch": 0.18916108535252818, + "grad_norm": 9.537945747375488, + "learning_rate": 4.6847315244124534e-05, + "loss": 3.1036, + "step": 608500 + }, + { + "epoch": 0.18931651763301505, + "grad_norm": 8.130046844482422, + "learning_rate": 4.6844724706116414e-05, + "loss": 3.1071, + "step": 609000 + }, + { + "epoch": 0.18947194991350194, + "grad_norm": 7.693198204040527, + "learning_rate": 4.68421341681083e-05, + "loss": 3.1231, + "step": 609500 + }, + { + "epoch": 0.1896273821939888, + "grad_norm": 7.169419765472412, + "learning_rate": 4.683954363010019e-05, + "loss": 3.0686, + "step": 610000 + }, + { + "epoch": 0.18978281447447568, + "grad_norm": 6.47094202041626, + "learning_rate": 4.6836953092092075e-05, + "loss": 3.1164, + "step": 610500 + }, + { + "epoch": 0.18993824675496257, + "grad_norm": 8.56041145324707, + "learning_rate": 4.683436255408396e-05, + "loss": 3.1385, + "step": 611000 + }, + { + "epoch": 0.19009367903544944, + "grad_norm": 8.093185424804688, + "learning_rate": 4.683177201607584e-05, + "loss": 3.1503, + "step": 611500 + }, + { + "epoch": 0.1902491113159363, + "grad_norm": 8.387419700622559, + "learning_rate": 4.682918147806773e-05, + "loss": 3.114, + "step": 612000 + }, + { + "epoch": 0.1904045435964232, + "grad_norm": 9.933442115783691, + "learning_rate": 4.682659094005962e-05, + "loss": 3.1623, + "step": 612500 + }, + { + "epoch": 0.19055997587691006, + "grad_norm": 6.170466423034668, + "learning_rate": 4.68240004020515e-05, + "loss": 3.1514, + "step": 613000 + }, + { + "epoch": 0.19071540815739693, + "grad_norm": 8.088785171508789, + "learning_rate": 4.6821409864043385e-05, + "loss": 3.0966, + "step": 613500 + }, + { + "epoch": 0.19087084043788383, + "grad_norm": 5.865373134613037, + "learning_rate": 4.681881932603527e-05, + "loss": 3.129, + "step": 614000 + }, + { + "epoch": 0.1910262727183707, + "grad_norm": 6.579500198364258, + "learning_rate": 4.681622878802715e-05, + "loss": 3.0865, + "step": 614500 + }, + { + "epoch": 0.19118170499885756, + "grad_norm": 6.7003302574157715, + "learning_rate": 4.6813638250019046e-05, + "loss": 3.0704, + "step": 615000 + }, + { + "epoch": 0.19133713727934445, + "grad_norm": 7.265932559967041, + "learning_rate": 4.681104771201093e-05, + "loss": 3.1173, + "step": 615500 + }, + { + "epoch": 0.19149256955983132, + "grad_norm": 8.902778625488281, + "learning_rate": 4.6808457174002814e-05, + "loss": 3.0918, + "step": 616000 + }, + { + "epoch": 0.1916480018403182, + "grad_norm": 15.606313705444336, + "learning_rate": 4.68058666359947e-05, + "loss": 3.1246, + "step": 616500 + }, + { + "epoch": 0.19180343412080508, + "grad_norm": 7.798036098480225, + "learning_rate": 4.680327609798659e-05, + "loss": 3.0944, + "step": 617000 + }, + { + "epoch": 0.19195886640129195, + "grad_norm": 7.870765686035156, + "learning_rate": 4.680068555997847e-05, + "loss": 3.1284, + "step": 617500 + }, + { + "epoch": 0.19211429868177882, + "grad_norm": 8.424897193908691, + "learning_rate": 4.6798095021970356e-05, + "loss": 3.1568, + "step": 618000 + }, + { + "epoch": 0.1922697309622657, + "grad_norm": 11.448667526245117, + "learning_rate": 4.6795504483962236e-05, + "loss": 3.0982, + "step": 618500 + }, + { + "epoch": 0.19242516324275258, + "grad_norm": 32.4866828918457, + "learning_rate": 4.679291394595412e-05, + "loss": 3.11, + "step": 619000 + }, + { + "epoch": 0.19258059552323945, + "grad_norm": 9.344354629516602, + "learning_rate": 4.679032340794601e-05, + "loss": 3.0993, + "step": 619500 + }, + { + "epoch": 0.19273602780372634, + "grad_norm": 6.214859962463379, + "learning_rate": 4.67877328699379e-05, + "loss": 3.1068, + "step": 620000 + }, + { + "epoch": 0.1928914600842132, + "grad_norm": 8.469339370727539, + "learning_rate": 4.6785142331929785e-05, + "loss": 3.1634, + "step": 620500 + }, + { + "epoch": 0.19304689236470007, + "grad_norm": 8.794650077819824, + "learning_rate": 4.678255179392167e-05, + "loss": 3.1363, + "step": 621000 + }, + { + "epoch": 0.19320232464518697, + "grad_norm": 6.498732566833496, + "learning_rate": 4.677996125591355e-05, + "loss": 3.1237, + "step": 621500 + }, + { + "epoch": 0.19335775692567383, + "grad_norm": 8.134804725646973, + "learning_rate": 4.677737071790544e-05, + "loss": 3.1215, + "step": 622000 + }, + { + "epoch": 0.1935131892061607, + "grad_norm": 10.99081039428711, + "learning_rate": 4.6774780179897326e-05, + "loss": 3.1007, + "step": 622500 + }, + { + "epoch": 0.1936686214866476, + "grad_norm": 6.220900058746338, + "learning_rate": 4.677218964188921e-05, + "loss": 3.1079, + "step": 623000 + }, + { + "epoch": 0.19382405376713446, + "grad_norm": 10.511489868164062, + "learning_rate": 4.6769599103881094e-05, + "loss": 3.1164, + "step": 623500 + }, + { + "epoch": 0.19397948604762133, + "grad_norm": 7.429703712463379, + "learning_rate": 4.6767008565872974e-05, + "loss": 3.114, + "step": 624000 + }, + { + "epoch": 0.19413491832810822, + "grad_norm": 11.511526107788086, + "learning_rate": 4.676441802786486e-05, + "loss": 3.1242, + "step": 624500 + }, + { + "epoch": 0.1942903506085951, + "grad_norm": 7.904088020324707, + "learning_rate": 4.6761827489856755e-05, + "loss": 3.1143, + "step": 625000 + }, + { + "epoch": 0.19444578288908196, + "grad_norm": 8.73144817352295, + "learning_rate": 4.6759236951848636e-05, + "loss": 3.1305, + "step": 625500 + }, + { + "epoch": 0.19460121516956885, + "grad_norm": 10.603700637817383, + "learning_rate": 4.675664641384052e-05, + "loss": 3.101, + "step": 626000 + }, + { + "epoch": 0.19475664745005572, + "grad_norm": 19.32013702392578, + "learning_rate": 4.675405587583241e-05, + "loss": 3.1167, + "step": 626500 + }, + { + "epoch": 0.1949120797305426, + "grad_norm": 10.381686210632324, + "learning_rate": 4.675146533782429e-05, + "loss": 3.0909, + "step": 627000 + }, + { + "epoch": 0.19506751201102948, + "grad_norm": 7.783875942230225, + "learning_rate": 4.674887479981618e-05, + "loss": 3.1433, + "step": 627500 + }, + { + "epoch": 0.19522294429151635, + "grad_norm": 6.744434356689453, + "learning_rate": 4.6746284261808065e-05, + "loss": 3.1186, + "step": 628000 + }, + { + "epoch": 0.19537837657200322, + "grad_norm": 8.73886489868164, + "learning_rate": 4.6743693723799945e-05, + "loss": 3.0865, + "step": 628500 + }, + { + "epoch": 0.1955338088524901, + "grad_norm": 8.231348991394043, + "learning_rate": 4.674110318579183e-05, + "loss": 3.1045, + "step": 629000 + }, + { + "epoch": 0.19568924113297698, + "grad_norm": 8.645265579223633, + "learning_rate": 4.673851264778372e-05, + "loss": 3.1258, + "step": 629500 + }, + { + "epoch": 0.19584467341346384, + "grad_norm": 6.88942289352417, + "learning_rate": 4.6735922109775607e-05, + "loss": 3.085, + "step": 630000 + }, + { + "epoch": 0.19600010569395074, + "grad_norm": 15.651482582092285, + "learning_rate": 4.6733331571767494e-05, + "loss": 3.0744, + "step": 630500 + }, + { + "epoch": 0.1961555379744376, + "grad_norm": 8.434839248657227, + "learning_rate": 4.6730741033759374e-05, + "loss": 3.1406, + "step": 631000 + }, + { + "epoch": 0.19631097025492447, + "grad_norm": 8.411150932312012, + "learning_rate": 4.672815049575126e-05, + "loss": 3.0905, + "step": 631500 + }, + { + "epoch": 0.19646640253541137, + "grad_norm": 9.964964866638184, + "learning_rate": 4.672555995774315e-05, + "loss": 3.1622, + "step": 632000 + }, + { + "epoch": 0.19662183481589823, + "grad_norm": 6.8978986740112305, + "learning_rate": 4.672296941973503e-05, + "loss": 3.0658, + "step": 632500 + }, + { + "epoch": 0.1967772670963851, + "grad_norm": 7.878255844116211, + "learning_rate": 4.6720378881726916e-05, + "loss": 3.1184, + "step": 633000 + }, + { + "epoch": 0.196932699376872, + "grad_norm": 6.41415548324585, + "learning_rate": 4.67177883437188e-05, + "loss": 3.1197, + "step": 633500 + }, + { + "epoch": 0.19708813165735886, + "grad_norm": 7.320611476898193, + "learning_rate": 4.6715197805710683e-05, + "loss": 3.1236, + "step": 634000 + }, + { + "epoch": 0.19724356393784573, + "grad_norm": 7.793437957763672, + "learning_rate": 4.671260726770257e-05, + "loss": 3.0959, + "step": 634500 + }, + { + "epoch": 0.19739899621833262, + "grad_norm": 8.220548629760742, + "learning_rate": 4.6710016729694465e-05, + "loss": 3.1136, + "step": 635000 + }, + { + "epoch": 0.1975544284988195, + "grad_norm": 8.927963256835938, + "learning_rate": 4.6707426191686345e-05, + "loss": 3.1315, + "step": 635500 + }, + { + "epoch": 0.19770986077930636, + "grad_norm": 10.460070610046387, + "learning_rate": 4.670483565367823e-05, + "loss": 3.089, + "step": 636000 + }, + { + "epoch": 0.19786529305979325, + "grad_norm": 16.385942459106445, + "learning_rate": 4.670224511567011e-05, + "loss": 3.1076, + "step": 636500 + }, + { + "epoch": 0.19802072534028012, + "grad_norm": 7.464322090148926, + "learning_rate": 4.6699654577662e-05, + "loss": 3.0975, + "step": 637000 + }, + { + "epoch": 0.19817615762076699, + "grad_norm": 11.456690788269043, + "learning_rate": 4.669706403965389e-05, + "loss": 3.1186, + "step": 637500 + }, + { + "epoch": 0.19833158990125388, + "grad_norm": 9.586854934692383, + "learning_rate": 4.669447350164577e-05, + "loss": 3.1563, + "step": 638000 + }, + { + "epoch": 0.19848702218174075, + "grad_norm": 8.113604545593262, + "learning_rate": 4.6691882963637654e-05, + "loss": 3.0938, + "step": 638500 + }, + { + "epoch": 0.1986424544622276, + "grad_norm": 7.602396011352539, + "learning_rate": 4.668929242562954e-05, + "loss": 3.1144, + "step": 639000 + }, + { + "epoch": 0.1987978867427145, + "grad_norm": 10.786232948303223, + "learning_rate": 4.668670188762143e-05, + "loss": 3.1225, + "step": 639500 + }, + { + "epoch": 0.19895331902320137, + "grad_norm": 7.190712928771973, + "learning_rate": 4.6684111349613316e-05, + "loss": 3.0956, + "step": 640000 + }, + { + "epoch": 0.19910875130368824, + "grad_norm": 9.038161277770996, + "learning_rate": 4.66815208116052e-05, + "loss": 3.1363, + "step": 640500 + }, + { + "epoch": 0.19926418358417514, + "grad_norm": 6.707172870635986, + "learning_rate": 4.667893027359708e-05, + "loss": 3.0982, + "step": 641000 + }, + { + "epoch": 0.199419615864662, + "grad_norm": 6.9285807609558105, + "learning_rate": 4.667633973558897e-05, + "loss": 3.1068, + "step": 641500 + }, + { + "epoch": 0.19957504814514887, + "grad_norm": 8.706789016723633, + "learning_rate": 4.667374919758085e-05, + "loss": 3.1087, + "step": 642000 + }, + { + "epoch": 0.19973048042563576, + "grad_norm": 9.911788940429688, + "learning_rate": 4.667115865957274e-05, + "loss": 3.1369, + "step": 642500 + }, + { + "epoch": 0.19988591270612263, + "grad_norm": 8.50448989868164, + "learning_rate": 4.6668568121564625e-05, + "loss": 3.1295, + "step": 643000 + }, + { + "epoch": 0.2000413449866095, + "grad_norm": 7.57549524307251, + "learning_rate": 4.6665977583556506e-05, + "loss": 3.0772, + "step": 643500 + }, + { + "epoch": 0.2001967772670964, + "grad_norm": 7.324392795562744, + "learning_rate": 4.666338704554839e-05, + "loss": 3.0912, + "step": 644000 + }, + { + "epoch": 0.20035220954758326, + "grad_norm": 9.380905151367188, + "learning_rate": 4.666079650754028e-05, + "loss": 3.139, + "step": 644500 + }, + { + "epoch": 0.20050764182807013, + "grad_norm": 17.502431869506836, + "learning_rate": 4.665820596953217e-05, + "loss": 3.0795, + "step": 645000 + }, + { + "epoch": 0.20066307410855702, + "grad_norm": 8.509015083312988, + "learning_rate": 4.6655615431524054e-05, + "loss": 3.1099, + "step": 645500 + }, + { + "epoch": 0.2008185063890439, + "grad_norm": 9.834274291992188, + "learning_rate": 4.665302489351594e-05, + "loss": 3.1386, + "step": 646000 + }, + { + "epoch": 0.20097393866953076, + "grad_norm": 7.484282970428467, + "learning_rate": 4.665043435550782e-05, + "loss": 3.0728, + "step": 646500 + }, + { + "epoch": 0.20112937095001765, + "grad_norm": 7.628221035003662, + "learning_rate": 4.664784381749971e-05, + "loss": 3.0759, + "step": 647000 + }, + { + "epoch": 0.20128480323050452, + "grad_norm": 10.678404808044434, + "learning_rate": 4.664525327949159e-05, + "loss": 3.1239, + "step": 647500 + }, + { + "epoch": 0.20144023551099138, + "grad_norm": 9.538772583007812, + "learning_rate": 4.6642662741483476e-05, + "loss": 3.1008, + "step": 648000 + }, + { + "epoch": 0.20159566779147828, + "grad_norm": 8.9669771194458, + "learning_rate": 4.6640072203475363e-05, + "loss": 3.0901, + "step": 648500 + }, + { + "epoch": 0.20175110007196514, + "grad_norm": 8.266900062561035, + "learning_rate": 4.663748166546725e-05, + "loss": 3.117, + "step": 649000 + }, + { + "epoch": 0.201906532352452, + "grad_norm": 11.236969947814941, + "learning_rate": 4.663489112745914e-05, + "loss": 3.0774, + "step": 649500 + }, + { + "epoch": 0.2020619646329389, + "grad_norm": 8.181771278381348, + "learning_rate": 4.6632300589451025e-05, + "loss": 3.1512, + "step": 650000 + }, + { + "epoch": 0.20221739691342577, + "grad_norm": 20.125614166259766, + "learning_rate": 4.6629710051442905e-05, + "loss": 3.0703, + "step": 650500 + }, + { + "epoch": 0.20237282919391264, + "grad_norm": 10.01820182800293, + "learning_rate": 4.662711951343479e-05, + "loss": 3.1069, + "step": 651000 + }, + { + "epoch": 0.20252826147439953, + "grad_norm": 9.350025177001953, + "learning_rate": 4.662452897542668e-05, + "loss": 3.1431, + "step": 651500 + }, + { + "epoch": 0.2026836937548864, + "grad_norm": 7.8117194175720215, + "learning_rate": 4.662193843741856e-05, + "loss": 3.1004, + "step": 652000 + }, + { + "epoch": 0.20283912603537327, + "grad_norm": 9.509490966796875, + "learning_rate": 4.661934789941045e-05, + "loss": 3.1239, + "step": 652500 + }, + { + "epoch": 0.20299455831586016, + "grad_norm": 7.560518741607666, + "learning_rate": 4.6616757361402334e-05, + "loss": 3.0942, + "step": 653000 + }, + { + "epoch": 0.20314999059634703, + "grad_norm": 8.404268264770508, + "learning_rate": 4.6614166823394215e-05, + "loss": 3.1259, + "step": 653500 + }, + { + "epoch": 0.2033054228768339, + "grad_norm": 10.376083374023438, + "learning_rate": 4.66115762853861e-05, + "loss": 3.0969, + "step": 654000 + }, + { + "epoch": 0.2034608551573208, + "grad_norm": 10.82886028289795, + "learning_rate": 4.660898574737799e-05, + "loss": 3.0984, + "step": 654500 + }, + { + "epoch": 0.20361628743780766, + "grad_norm": 8.26655101776123, + "learning_rate": 4.6606395209369876e-05, + "loss": 3.1, + "step": 655000 + }, + { + "epoch": 0.20377171971829453, + "grad_norm": 10.772130966186523, + "learning_rate": 4.660380467136176e-05, + "loss": 3.1478, + "step": 655500 + }, + { + "epoch": 0.20392715199878142, + "grad_norm": 53.16033935546875, + "learning_rate": 4.6601214133353644e-05, + "loss": 3.1226, + "step": 656000 + }, + { + "epoch": 0.2040825842792683, + "grad_norm": 9.038418769836426, + "learning_rate": 4.659862359534553e-05, + "loss": 3.1091, + "step": 656500 + }, + { + "epoch": 0.20423801655975515, + "grad_norm": 8.938298225402832, + "learning_rate": 4.659603305733742e-05, + "loss": 3.1024, + "step": 657000 + }, + { + "epoch": 0.20439344884024205, + "grad_norm": 6.9254865646362305, + "learning_rate": 4.65934425193293e-05, + "loss": 3.0992, + "step": 657500 + }, + { + "epoch": 0.20454888112072891, + "grad_norm": 6.589847564697266, + "learning_rate": 4.6590851981321186e-05, + "loss": 3.0518, + "step": 658000 + }, + { + "epoch": 0.20470431340121578, + "grad_norm": 8.987970352172852, + "learning_rate": 4.658826144331307e-05, + "loss": 3.1376, + "step": 658500 + }, + { + "epoch": 0.20485974568170268, + "grad_norm": 5.877768516540527, + "learning_rate": 4.658567090530496e-05, + "loss": 3.1294, + "step": 659000 + }, + { + "epoch": 0.20501517796218954, + "grad_norm": 13.909720420837402, + "learning_rate": 4.658308036729685e-05, + "loss": 3.1336, + "step": 659500 + }, + { + "epoch": 0.2051706102426764, + "grad_norm": 6.741101264953613, + "learning_rate": 4.658048982928873e-05, + "loss": 3.105, + "step": 660000 + }, + { + "epoch": 0.2053260425231633, + "grad_norm": 8.147170066833496, + "learning_rate": 4.6577899291280615e-05, + "loss": 3.0758, + "step": 660500 + }, + { + "epoch": 0.20548147480365017, + "grad_norm": 9.793052673339844, + "learning_rate": 4.65753087532725e-05, + "loss": 3.0831, + "step": 661000 + }, + { + "epoch": 0.20563690708413704, + "grad_norm": 7.801171779632568, + "learning_rate": 4.657271821526438e-05, + "loss": 3.0967, + "step": 661500 + }, + { + "epoch": 0.20579233936462393, + "grad_norm": 7.568404674530029, + "learning_rate": 4.657012767725627e-05, + "loss": 3.0518, + "step": 662000 + }, + { + "epoch": 0.2059477716451108, + "grad_norm": 11.307735443115234, + "learning_rate": 4.6567537139248156e-05, + "loss": 3.1014, + "step": 662500 + }, + { + "epoch": 0.20610320392559767, + "grad_norm": 6.879087924957275, + "learning_rate": 4.656494660124004e-05, + "loss": 3.1063, + "step": 663000 + }, + { + "epoch": 0.20625863620608456, + "grad_norm": 7.387253761291504, + "learning_rate": 4.6562356063231924e-05, + "loss": 3.1252, + "step": 663500 + }, + { + "epoch": 0.20641406848657143, + "grad_norm": 7.845255374908447, + "learning_rate": 4.655976552522381e-05, + "loss": 3.1038, + "step": 664000 + }, + { + "epoch": 0.2065695007670583, + "grad_norm": 9.595666885375977, + "learning_rate": 4.65571749872157e-05, + "loss": 3.1139, + "step": 664500 + }, + { + "epoch": 0.2067249330475452, + "grad_norm": 7.816808223724365, + "learning_rate": 4.6554584449207585e-05, + "loss": 3.1304, + "step": 665000 + }, + { + "epoch": 0.20688036532803206, + "grad_norm": 7.084340572357178, + "learning_rate": 4.655199391119947e-05, + "loss": 3.109, + "step": 665500 + }, + { + "epoch": 0.20703579760851892, + "grad_norm": 8.81795597076416, + "learning_rate": 4.654940337319135e-05, + "loss": 3.0175, + "step": 666000 + }, + { + "epoch": 0.20719122988900582, + "grad_norm": 7.114587306976318, + "learning_rate": 4.654681283518324e-05, + "loss": 3.0732, + "step": 666500 + }, + { + "epoch": 0.20734666216949268, + "grad_norm": 8.166281700134277, + "learning_rate": 4.654422229717512e-05, + "loss": 3.1443, + "step": 667000 + }, + { + "epoch": 0.20750209444997955, + "grad_norm": 9.314291000366211, + "learning_rate": 4.654163175916701e-05, + "loss": 3.1024, + "step": 667500 + }, + { + "epoch": 0.20765752673046645, + "grad_norm": 13.346175193786621, + "learning_rate": 4.6539041221158895e-05, + "loss": 3.1338, + "step": 668000 + }, + { + "epoch": 0.2078129590109533, + "grad_norm": 10.075983047485352, + "learning_rate": 4.653645068315078e-05, + "loss": 3.116, + "step": 668500 + }, + { + "epoch": 0.20796839129144018, + "grad_norm": 7.880954265594482, + "learning_rate": 4.653386014514267e-05, + "loss": 3.0878, + "step": 669000 + }, + { + "epoch": 0.20812382357192707, + "grad_norm": 8.865723609924316, + "learning_rate": 4.6531269607134556e-05, + "loss": 3.1158, + "step": 669500 + }, + { + "epoch": 0.20827925585241394, + "grad_norm": 6.158698558807373, + "learning_rate": 4.6528679069126437e-05, + "loss": 3.123, + "step": 670000 + }, + { + "epoch": 0.2084346881329008, + "grad_norm": 6.745934963226318, + "learning_rate": 4.6526088531118324e-05, + "loss": 3.1105, + "step": 670500 + }, + { + "epoch": 0.2085901204133877, + "grad_norm": 9.43136215209961, + "learning_rate": 4.652349799311021e-05, + "loss": 3.1195, + "step": 671000 + }, + { + "epoch": 0.20874555269387457, + "grad_norm": 10.232465744018555, + "learning_rate": 4.652090745510209e-05, + "loss": 3.0883, + "step": 671500 + }, + { + "epoch": 0.20890098497436144, + "grad_norm": 6.815830230712891, + "learning_rate": 4.651831691709398e-05, + "loss": 3.0865, + "step": 672000 + }, + { + "epoch": 0.20905641725484833, + "grad_norm": 15.94247055053711, + "learning_rate": 4.651572637908586e-05, + "loss": 3.1254, + "step": 672500 + }, + { + "epoch": 0.2092118495353352, + "grad_norm": 8.117185592651367, + "learning_rate": 4.6513135841077746e-05, + "loss": 3.1282, + "step": 673000 + }, + { + "epoch": 0.20936728181582206, + "grad_norm": 9.324047088623047, + "learning_rate": 4.651054530306963e-05, + "loss": 3.1052, + "step": 673500 + }, + { + "epoch": 0.20952271409630896, + "grad_norm": 17.477428436279297, + "learning_rate": 4.650795476506152e-05, + "loss": 3.1213, + "step": 674000 + }, + { + "epoch": 0.20967814637679583, + "grad_norm": 15.528051376342773, + "learning_rate": 4.650536422705341e-05, + "loss": 3.1019, + "step": 674500 + }, + { + "epoch": 0.2098335786572827, + "grad_norm": 5.54828405380249, + "learning_rate": 4.6502773689045295e-05, + "loss": 3.0634, + "step": 675000 + }, + { + "epoch": 0.2099890109377696, + "grad_norm": 8.286291122436523, + "learning_rate": 4.6500183151037175e-05, + "loss": 3.1015, + "step": 675500 + }, + { + "epoch": 0.21014444321825645, + "grad_norm": 9.746342658996582, + "learning_rate": 4.649759261302906e-05, + "loss": 3.1316, + "step": 676000 + }, + { + "epoch": 0.21029987549874332, + "grad_norm": 11.046323776245117, + "learning_rate": 4.649500207502095e-05, + "loss": 3.1306, + "step": 676500 + }, + { + "epoch": 0.21045530777923022, + "grad_norm": 7.759479522705078, + "learning_rate": 4.649241153701283e-05, + "loss": 3.0955, + "step": 677000 + }, + { + "epoch": 0.21061074005971708, + "grad_norm": 10.931486129760742, + "learning_rate": 4.648982099900472e-05, + "loss": 3.0877, + "step": 677500 + }, + { + "epoch": 0.21076617234020395, + "grad_norm": 15.514293670654297, + "learning_rate": 4.64872304609966e-05, + "loss": 3.0544, + "step": 678000 + }, + { + "epoch": 0.21092160462069084, + "grad_norm": 11.440436363220215, + "learning_rate": 4.648463992298849e-05, + "loss": 3.0734, + "step": 678500 + }, + { + "epoch": 0.2110770369011777, + "grad_norm": 8.3727445602417, + "learning_rate": 4.648204938498038e-05, + "loss": 3.201, + "step": 679000 + }, + { + "epoch": 0.21123246918166458, + "grad_norm": 7.731194496154785, + "learning_rate": 4.647945884697226e-05, + "loss": 3.0745, + "step": 679500 + }, + { + "epoch": 0.21138790146215147, + "grad_norm": 9.413427352905273, + "learning_rate": 4.6476868308964146e-05, + "loss": 3.0879, + "step": 680000 + }, + { + "epoch": 0.21154333374263834, + "grad_norm": 6.233428001403809, + "learning_rate": 4.647427777095603e-05, + "loss": 3.0924, + "step": 680500 + }, + { + "epoch": 0.2116987660231252, + "grad_norm": 9.045272827148438, + "learning_rate": 4.647168723294791e-05, + "loss": 3.1041, + "step": 681000 + }, + { + "epoch": 0.2118541983036121, + "grad_norm": 8.164320945739746, + "learning_rate": 4.64690966949398e-05, + "loss": 3.0849, + "step": 681500 + }, + { + "epoch": 0.21200963058409897, + "grad_norm": 8.098261833190918, + "learning_rate": 4.646650615693169e-05, + "loss": 3.1213, + "step": 682000 + }, + { + "epoch": 0.21216506286458583, + "grad_norm": 10.136384010314941, + "learning_rate": 4.646391561892357e-05, + "loss": 3.1133, + "step": 682500 + }, + { + "epoch": 0.21232049514507273, + "grad_norm": 7.637228965759277, + "learning_rate": 4.6461325080915455e-05, + "loss": 3.1039, + "step": 683000 + }, + { + "epoch": 0.2124759274255596, + "grad_norm": 7.419344902038574, + "learning_rate": 4.645873454290734e-05, + "loss": 3.07, + "step": 683500 + }, + { + "epoch": 0.21263135970604646, + "grad_norm": 11.413835525512695, + "learning_rate": 4.645614400489923e-05, + "loss": 3.0568, + "step": 684000 + }, + { + "epoch": 0.21278679198653336, + "grad_norm": 7.721258640289307, + "learning_rate": 4.6453553466891117e-05, + "loss": 3.0925, + "step": 684500 + }, + { + "epoch": 0.21294222426702022, + "grad_norm": 9.015264511108398, + "learning_rate": 4.6450962928883e-05, + "loss": 3.0835, + "step": 685000 + }, + { + "epoch": 0.2130976565475071, + "grad_norm": 10.107800483703613, + "learning_rate": 4.6448372390874884e-05, + "loss": 3.0894, + "step": 685500 + }, + { + "epoch": 0.213253088827994, + "grad_norm": 8.337308883666992, + "learning_rate": 4.644578185286677e-05, + "loss": 3.1037, + "step": 686000 + }, + { + "epoch": 0.21340852110848085, + "grad_norm": 6.401755332946777, + "learning_rate": 4.644319131485865e-05, + "loss": 3.0811, + "step": 686500 + }, + { + "epoch": 0.21356395338896772, + "grad_norm": 24.287160873413086, + "learning_rate": 4.644060077685054e-05, + "loss": 3.0803, + "step": 687000 + }, + { + "epoch": 0.21371938566945461, + "grad_norm": 7.071877956390381, + "learning_rate": 4.6438010238842426e-05, + "loss": 3.0929, + "step": 687500 + }, + { + "epoch": 0.21387481794994148, + "grad_norm": 11.21955680847168, + "learning_rate": 4.6435419700834306e-05, + "loss": 3.0439, + "step": 688000 + }, + { + "epoch": 0.21403025023042835, + "grad_norm": 8.287102699279785, + "learning_rate": 4.64328291628262e-05, + "loss": 3.0898, + "step": 688500 + }, + { + "epoch": 0.21418568251091524, + "grad_norm": 8.715441703796387, + "learning_rate": 4.643023862481809e-05, + "loss": 3.1533, + "step": 689000 + }, + { + "epoch": 0.2143411147914021, + "grad_norm": 7.06083869934082, + "learning_rate": 4.642764808680997e-05, + "loss": 3.0509, + "step": 689500 + }, + { + "epoch": 0.21449654707188898, + "grad_norm": 8.16001033782959, + "learning_rate": 4.6425057548801855e-05, + "loss": 3.0634, + "step": 690000 + }, + { + "epoch": 0.21465197935237587, + "grad_norm": 7.778217792510986, + "learning_rate": 4.6422467010793735e-05, + "loss": 3.0845, + "step": 690500 + }, + { + "epoch": 0.21480741163286274, + "grad_norm": 9.14334774017334, + "learning_rate": 4.641987647278562e-05, + "loss": 3.1764, + "step": 691000 + }, + { + "epoch": 0.2149628439133496, + "grad_norm": 6.883997440338135, + "learning_rate": 4.641728593477751e-05, + "loss": 3.1063, + "step": 691500 + }, + { + "epoch": 0.2151182761938365, + "grad_norm": 13.946197509765625, + "learning_rate": 4.641469539676939e-05, + "loss": 3.122, + "step": 692000 + }, + { + "epoch": 0.21527370847432337, + "grad_norm": 6.9886794090271, + "learning_rate": 4.641210485876128e-05, + "loss": 3.0598, + "step": 692500 + }, + { + "epoch": 0.21542914075481023, + "grad_norm": 7.6012187004089355, + "learning_rate": 4.6409514320753164e-05, + "loss": 3.0669, + "step": 693000 + }, + { + "epoch": 0.21558457303529713, + "grad_norm": 8.917882919311523, + "learning_rate": 4.640692378274505e-05, + "loss": 3.1135, + "step": 693500 + }, + { + "epoch": 0.215740005315784, + "grad_norm": 9.358829498291016, + "learning_rate": 4.640433324473694e-05, + "loss": 3.0682, + "step": 694000 + }, + { + "epoch": 0.21589543759627086, + "grad_norm": 9.055254936218262, + "learning_rate": 4.6401742706728826e-05, + "loss": 3.08, + "step": 694500 + }, + { + "epoch": 0.21605086987675776, + "grad_norm": 6.561962127685547, + "learning_rate": 4.6399152168720706e-05, + "loss": 3.1032, + "step": 695000 + }, + { + "epoch": 0.21620630215724462, + "grad_norm": 10.674737930297852, + "learning_rate": 4.639656163071259e-05, + "loss": 3.1046, + "step": 695500 + }, + { + "epoch": 0.2163617344377315, + "grad_norm": 7.742291450500488, + "learning_rate": 4.6393971092704474e-05, + "loss": 3.122, + "step": 696000 + }, + { + "epoch": 0.21651716671821838, + "grad_norm": 10.465185165405273, + "learning_rate": 4.639138055469636e-05, + "loss": 3.0992, + "step": 696500 + }, + { + "epoch": 0.21667259899870525, + "grad_norm": 9.96226978302002, + "learning_rate": 4.638879001668825e-05, + "loss": 3.0769, + "step": 697000 + }, + { + "epoch": 0.21682803127919212, + "grad_norm": 18.011751174926758, + "learning_rate": 4.638619947868013e-05, + "loss": 3.0894, + "step": 697500 + }, + { + "epoch": 0.216983463559679, + "grad_norm": 5.298544883728027, + "learning_rate": 4.6383608940672015e-05, + "loss": 3.0613, + "step": 698000 + }, + { + "epoch": 0.21713889584016588, + "grad_norm": 10.325242042541504, + "learning_rate": 4.638101840266391e-05, + "loss": 3.1052, + "step": 698500 + }, + { + "epoch": 0.21729432812065275, + "grad_norm": 7.849708080291748, + "learning_rate": 4.637842786465579e-05, + "loss": 3.0994, + "step": 699000 + }, + { + "epoch": 0.21744976040113964, + "grad_norm": 10.420981407165527, + "learning_rate": 4.637583732664768e-05, + "loss": 3.0917, + "step": 699500 + }, + { + "epoch": 0.2176051926816265, + "grad_norm": 9.965137481689453, + "learning_rate": 4.6373246788639564e-05, + "loss": 3.0268, + "step": 700000 + }, + { + "epoch": 0.21776062496211337, + "grad_norm": 13.797762870788574, + "learning_rate": 4.6370656250631444e-05, + "loss": 3.101, + "step": 700500 + }, + { + "epoch": 0.21791605724260027, + "grad_norm": 8.923644065856934, + "learning_rate": 4.636806571262333e-05, + "loss": 3.1235, + "step": 701000 + }, + { + "epoch": 0.21807148952308714, + "grad_norm": 5.629330158233643, + "learning_rate": 4.636547517461522e-05, + "loss": 3.1011, + "step": 701500 + }, + { + "epoch": 0.218226921803574, + "grad_norm": 7.493610382080078, + "learning_rate": 4.63628846366071e-05, + "loss": 3.1071, + "step": 702000 + }, + { + "epoch": 0.2183823540840609, + "grad_norm": 9.79520320892334, + "learning_rate": 4.6360294098598986e-05, + "loss": 3.1047, + "step": 702500 + }, + { + "epoch": 0.21853778636454776, + "grad_norm": 7.958981037139893, + "learning_rate": 4.6357703560590873e-05, + "loss": 3.1056, + "step": 703000 + }, + { + "epoch": 0.21869321864503463, + "grad_norm": 8.38884449005127, + "learning_rate": 4.635511302258276e-05, + "loss": 3.0722, + "step": 703500 + }, + { + "epoch": 0.21884865092552153, + "grad_norm": 10.052433967590332, + "learning_rate": 4.635252248457465e-05, + "loss": 3.1232, + "step": 704000 + }, + { + "epoch": 0.2190040832060084, + "grad_norm": 10.565025329589844, + "learning_rate": 4.634993194656653e-05, + "loss": 3.095, + "step": 704500 + }, + { + "epoch": 0.21915951548649526, + "grad_norm": 7.886307239532471, + "learning_rate": 4.6347341408558415e-05, + "loss": 3.0898, + "step": 705000 + }, + { + "epoch": 0.21931494776698215, + "grad_norm": 8.57146167755127, + "learning_rate": 4.63447508705503e-05, + "loss": 3.0893, + "step": 705500 + }, + { + "epoch": 0.21947038004746902, + "grad_norm": 8.484508514404297, + "learning_rate": 4.634216033254218e-05, + "loss": 3.0752, + "step": 706000 + }, + { + "epoch": 0.2196258123279559, + "grad_norm": 6.265040874481201, + "learning_rate": 4.633956979453407e-05, + "loss": 3.1117, + "step": 706500 + }, + { + "epoch": 0.21978124460844278, + "grad_norm": 10.907879829406738, + "learning_rate": 4.633697925652596e-05, + "loss": 3.069, + "step": 707000 + }, + { + "epoch": 0.21993667688892965, + "grad_norm": 14.39404582977295, + "learning_rate": 4.633438871851784e-05, + "loss": 3.0857, + "step": 707500 + }, + { + "epoch": 0.22009210916941652, + "grad_norm": 7.403165340423584, + "learning_rate": 4.6331798180509725e-05, + "loss": 3.1129, + "step": 708000 + }, + { + "epoch": 0.2202475414499034, + "grad_norm": 26.745594024658203, + "learning_rate": 4.632920764250161e-05, + "loss": 3.1195, + "step": 708500 + }, + { + "epoch": 0.22040297373039028, + "grad_norm": 6.791961669921875, + "learning_rate": 4.63266171044935e-05, + "loss": 3.0961, + "step": 709000 + }, + { + "epoch": 0.22055840601087714, + "grad_norm": 7.5692033767700195, + "learning_rate": 4.6324026566485386e-05, + "loss": 3.0762, + "step": 709500 + }, + { + "epoch": 0.22071383829136404, + "grad_norm": 9.529583930969238, + "learning_rate": 4.6321436028477267e-05, + "loss": 3.1441, + "step": 710000 + }, + { + "epoch": 0.2208692705718509, + "grad_norm": 21.870113372802734, + "learning_rate": 4.6318845490469154e-05, + "loss": 3.1176, + "step": 710500 + }, + { + "epoch": 0.22102470285233777, + "grad_norm": 10.630891799926758, + "learning_rate": 4.631625495246104e-05, + "loss": 3.0682, + "step": 711000 + }, + { + "epoch": 0.22118013513282467, + "grad_norm": 8.902356147766113, + "learning_rate": 4.631366441445292e-05, + "loss": 3.0975, + "step": 711500 + }, + { + "epoch": 0.22133556741331153, + "grad_norm": 6.929142475128174, + "learning_rate": 4.631107387644481e-05, + "loss": 3.1114, + "step": 712000 + }, + { + "epoch": 0.2214909996937984, + "grad_norm": 6.507615089416504, + "learning_rate": 4.6308483338436696e-05, + "loss": 3.1105, + "step": 712500 + }, + { + "epoch": 0.2216464319742853, + "grad_norm": 11.214269638061523, + "learning_rate": 4.630589280042858e-05, + "loss": 3.0336, + "step": 713000 + }, + { + "epoch": 0.22180186425477216, + "grad_norm": 13.986176490783691, + "learning_rate": 4.630330226242047e-05, + "loss": 3.068, + "step": 713500 + }, + { + "epoch": 0.22195729653525903, + "grad_norm": 7.805002212524414, + "learning_rate": 4.630071172441235e-05, + "loss": 3.1262, + "step": 714000 + }, + { + "epoch": 0.22211272881574592, + "grad_norm": 11.751163482666016, + "learning_rate": 4.629812118640424e-05, + "loss": 3.1177, + "step": 714500 + }, + { + "epoch": 0.2222681610962328, + "grad_norm": 8.907294273376465, + "learning_rate": 4.6295530648396124e-05, + "loss": 3.0914, + "step": 715000 + }, + { + "epoch": 0.22242359337671966, + "grad_norm": 7.415829181671143, + "learning_rate": 4.6292940110388005e-05, + "loss": 3.1351, + "step": 715500 + }, + { + "epoch": 0.22257902565720655, + "grad_norm": 9.0299654006958, + "learning_rate": 4.629034957237989e-05, + "loss": 3.0799, + "step": 716000 + }, + { + "epoch": 0.22273445793769342, + "grad_norm": 6.665262699127197, + "learning_rate": 4.628775903437178e-05, + "loss": 3.0735, + "step": 716500 + }, + { + "epoch": 0.2228898902181803, + "grad_norm": 14.683706283569336, + "learning_rate": 4.628516849636366e-05, + "loss": 3.089, + "step": 717000 + }, + { + "epoch": 0.22304532249866718, + "grad_norm": 12.986461639404297, + "learning_rate": 4.628257795835555e-05, + "loss": 3.1125, + "step": 717500 + }, + { + "epoch": 0.22320075477915405, + "grad_norm": 8.088714599609375, + "learning_rate": 4.6279987420347434e-05, + "loss": 3.067, + "step": 718000 + }, + { + "epoch": 0.22335618705964091, + "grad_norm": 7.987882137298584, + "learning_rate": 4.627739688233932e-05, + "loss": 3.1123, + "step": 718500 + }, + { + "epoch": 0.2235116193401278, + "grad_norm": 41.16291046142578, + "learning_rate": 4.627480634433121e-05, + "loss": 3.0888, + "step": 719000 + }, + { + "epoch": 0.22366705162061468, + "grad_norm": 5.462239742279053, + "learning_rate": 4.6272215806323095e-05, + "loss": 3.0761, + "step": 719500 + }, + { + "epoch": 0.22382248390110154, + "grad_norm": 7.6202168464660645, + "learning_rate": 4.6269625268314976e-05, + "loss": 3.133, + "step": 720000 + }, + { + "epoch": 0.22397791618158844, + "grad_norm": 7.56167459487915, + "learning_rate": 4.626703473030686e-05, + "loss": 3.0841, + "step": 720500 + }, + { + "epoch": 0.2241333484620753, + "grad_norm": 8.476016998291016, + "learning_rate": 4.626444419229874e-05, + "loss": 3.103, + "step": 721000 + }, + { + "epoch": 0.22428878074256217, + "grad_norm": 7.34256649017334, + "learning_rate": 4.626185365429063e-05, + "loss": 3.1549, + "step": 721500 + }, + { + "epoch": 0.22444421302304907, + "grad_norm": 13.673971176147461, + "learning_rate": 4.625926311628252e-05, + "loss": 3.0961, + "step": 722000 + }, + { + "epoch": 0.22459964530353593, + "grad_norm": 8.013544082641602, + "learning_rate": 4.6256672578274405e-05, + "loss": 3.0867, + "step": 722500 + }, + { + "epoch": 0.2247550775840228, + "grad_norm": 13.6668119430542, + "learning_rate": 4.625408204026629e-05, + "loss": 3.0568, + "step": 723000 + }, + { + "epoch": 0.2249105098645097, + "grad_norm": 34.72869873046875, + "learning_rate": 4.625149150225818e-05, + "loss": 3.062, + "step": 723500 + }, + { + "epoch": 0.22506594214499656, + "grad_norm": 7.199549198150635, + "learning_rate": 4.624890096425006e-05, + "loss": 3.0606, + "step": 724000 + }, + { + "epoch": 0.22522137442548343, + "grad_norm": 7.518344402313232, + "learning_rate": 4.6246310426241947e-05, + "loss": 3.0703, + "step": 724500 + }, + { + "epoch": 0.22537680670597032, + "grad_norm": 6.298249244689941, + "learning_rate": 4.6243719888233834e-05, + "loss": 3.0709, + "step": 725000 + }, + { + "epoch": 0.2255322389864572, + "grad_norm": 7.979647636413574, + "learning_rate": 4.6241129350225714e-05, + "loss": 3.0926, + "step": 725500 + }, + { + "epoch": 0.22568767126694406, + "grad_norm": 6.818674564361572, + "learning_rate": 4.62385388122176e-05, + "loss": 3.097, + "step": 726000 + }, + { + "epoch": 0.22584310354743095, + "grad_norm": 9.785653114318848, + "learning_rate": 4.623594827420948e-05, + "loss": 3.0935, + "step": 726500 + }, + { + "epoch": 0.22599853582791782, + "grad_norm": 6.899860382080078, + "learning_rate": 4.623335773620137e-05, + "loss": 3.1125, + "step": 727000 + }, + { + "epoch": 0.22615396810840468, + "grad_norm": 12.097755432128906, + "learning_rate": 4.6230767198193256e-05, + "loss": 3.1368, + "step": 727500 + }, + { + "epoch": 0.22630940038889158, + "grad_norm": 8.575678825378418, + "learning_rate": 4.622817666018514e-05, + "loss": 3.0899, + "step": 728000 + }, + { + "epoch": 0.22646483266937845, + "grad_norm": 4.693312168121338, + "learning_rate": 4.622558612217703e-05, + "loss": 3.0866, + "step": 728500 + }, + { + "epoch": 0.2266202649498653, + "grad_norm": 9.422880172729492, + "learning_rate": 4.622299558416892e-05, + "loss": 3.0397, + "step": 729000 + }, + { + "epoch": 0.2267756972303522, + "grad_norm": 7.295072078704834, + "learning_rate": 4.62204050461608e-05, + "loss": 3.0906, + "step": 729500 + }, + { + "epoch": 0.22693112951083907, + "grad_norm": 13.432278633117676, + "learning_rate": 4.6217814508152685e-05, + "loss": 3.0772, + "step": 730000 + }, + { + "epoch": 0.22708656179132594, + "grad_norm": 11.971627235412598, + "learning_rate": 4.621522397014457e-05, + "loss": 3.1067, + "step": 730500 + }, + { + "epoch": 0.2272419940718128, + "grad_norm": 7.930102825164795, + "learning_rate": 4.621263343213645e-05, + "loss": 3.0997, + "step": 731000 + }, + { + "epoch": 0.2273974263522997, + "grad_norm": 8.501309394836426, + "learning_rate": 4.621004289412834e-05, + "loss": 3.096, + "step": 731500 + }, + { + "epoch": 0.22755285863278657, + "grad_norm": 6.425371170043945, + "learning_rate": 4.620745235612022e-05, + "loss": 3.1133, + "step": 732000 + }, + { + "epoch": 0.22770829091327344, + "grad_norm": 6.512748718261719, + "learning_rate": 4.6204861818112114e-05, + "loss": 3.0736, + "step": 732500 + }, + { + "epoch": 0.22786372319376033, + "grad_norm": 7.411592960357666, + "learning_rate": 4.6202271280104e-05, + "loss": 3.0811, + "step": 733000 + }, + { + "epoch": 0.2280191554742472, + "grad_norm": 6.137753963470459, + "learning_rate": 4.619968074209588e-05, + "loss": 3.044, + "step": 733500 + }, + { + "epoch": 0.22817458775473407, + "grad_norm": 10.719345092773438, + "learning_rate": 4.619709020408777e-05, + "loss": 3.0885, + "step": 734000 + }, + { + "epoch": 0.22833002003522096, + "grad_norm": 10.439685821533203, + "learning_rate": 4.6194499666079656e-05, + "loss": 3.1288, + "step": 734500 + }, + { + "epoch": 0.22848545231570783, + "grad_norm": 8.270237922668457, + "learning_rate": 4.6191909128071536e-05, + "loss": 3.0988, + "step": 735000 + }, + { + "epoch": 0.2286408845961947, + "grad_norm": 8.632219314575195, + "learning_rate": 4.618931859006342e-05, + "loss": 3.1107, + "step": 735500 + }, + { + "epoch": 0.2287963168766816, + "grad_norm": 8.44904899597168, + "learning_rate": 4.618672805205531e-05, + "loss": 3.0975, + "step": 736000 + }, + { + "epoch": 0.22895174915716845, + "grad_norm": 5.726816177368164, + "learning_rate": 4.618413751404719e-05, + "loss": 3.077, + "step": 736500 + }, + { + "epoch": 0.22910718143765532, + "grad_norm": 10.471217155456543, + "learning_rate": 4.618154697603908e-05, + "loss": 3.0709, + "step": 737000 + }, + { + "epoch": 0.22926261371814222, + "grad_norm": 7.350876808166504, + "learning_rate": 4.6178956438030965e-05, + "loss": 3.1102, + "step": 737500 + }, + { + "epoch": 0.22941804599862908, + "grad_norm": 9.269872665405273, + "learning_rate": 4.617636590002285e-05, + "loss": 3.0602, + "step": 738000 + }, + { + "epoch": 0.22957347827911595, + "grad_norm": 37.0672492980957, + "learning_rate": 4.617377536201474e-05, + "loss": 3.0943, + "step": 738500 + }, + { + "epoch": 0.22972891055960284, + "grad_norm": 6.7067766189575195, + "learning_rate": 4.617118482400662e-05, + "loss": 3.1012, + "step": 739000 + }, + { + "epoch": 0.2298843428400897, + "grad_norm": 11.961145401000977, + "learning_rate": 4.616859428599851e-05, + "loss": 3.1034, + "step": 739500 + }, + { + "epoch": 0.23003977512057658, + "grad_norm": 7.558399677276611, + "learning_rate": 4.6166003747990394e-05, + "loss": 3.1138, + "step": 740000 + }, + { + "epoch": 0.23019520740106347, + "grad_norm": 8.533141136169434, + "learning_rate": 4.6163413209982274e-05, + "loss": 3.1008, + "step": 740500 + }, + { + "epoch": 0.23035063968155034, + "grad_norm": 5.811446189880371, + "learning_rate": 4.616082267197416e-05, + "loss": 3.0634, + "step": 741000 + }, + { + "epoch": 0.2305060719620372, + "grad_norm": 6.367056846618652, + "learning_rate": 4.615823213396605e-05, + "loss": 3.088, + "step": 741500 + }, + { + "epoch": 0.2306615042425241, + "grad_norm": 8.362862586975098, + "learning_rate": 4.615564159595793e-05, + "loss": 3.1109, + "step": 742000 + }, + { + "epoch": 0.23081693652301097, + "grad_norm": 8.59807300567627, + "learning_rate": 4.615305105794982e-05, + "loss": 3.1312, + "step": 742500 + }, + { + "epoch": 0.23097236880349784, + "grad_norm": 5.033411979675293, + "learning_rate": 4.615046051994171e-05, + "loss": 3.0725, + "step": 743000 + }, + { + "epoch": 0.23112780108398473, + "grad_norm": 46.89460754394531, + "learning_rate": 4.614786998193359e-05, + "loss": 3.1295, + "step": 743500 + }, + { + "epoch": 0.2312832333644716, + "grad_norm": 7.6855597496032715, + "learning_rate": 4.614527944392548e-05, + "loss": 3.0885, + "step": 744000 + }, + { + "epoch": 0.23143866564495846, + "grad_norm": 6.905600070953369, + "learning_rate": 4.614268890591736e-05, + "loss": 3.0133, + "step": 744500 + }, + { + "epoch": 0.23159409792544536, + "grad_norm": 8.76069450378418, + "learning_rate": 4.6140098367909245e-05, + "loss": 3.0627, + "step": 745000 + }, + { + "epoch": 0.23174953020593222, + "grad_norm": 9.22522258758545, + "learning_rate": 4.613750782990113e-05, + "loss": 3.1031, + "step": 745500 + }, + { + "epoch": 0.2319049624864191, + "grad_norm": 7.306840419769287, + "learning_rate": 4.613491729189301e-05, + "loss": 3.0703, + "step": 746000 + }, + { + "epoch": 0.232060394766906, + "grad_norm": 7.032435417175293, + "learning_rate": 4.61323267538849e-05, + "loss": 3.0781, + "step": 746500 + }, + { + "epoch": 0.23221582704739285, + "grad_norm": 8.711956024169922, + "learning_rate": 4.612973621587679e-05, + "loss": 3.0666, + "step": 747000 + }, + { + "epoch": 0.23237125932787972, + "grad_norm": 9.099496841430664, + "learning_rate": 4.6127145677868674e-05, + "loss": 3.0862, + "step": 747500 + }, + { + "epoch": 0.23252669160836661, + "grad_norm": 8.438478469848633, + "learning_rate": 4.612455513986056e-05, + "loss": 3.0886, + "step": 748000 + }, + { + "epoch": 0.23268212388885348, + "grad_norm": 6.080206871032715, + "learning_rate": 4.612196460185245e-05, + "loss": 3.0611, + "step": 748500 + }, + { + "epoch": 0.23283755616934035, + "grad_norm": 8.093314170837402, + "learning_rate": 4.611937406384433e-05, + "loss": 3.0495, + "step": 749000 + }, + { + "epoch": 0.23299298844982724, + "grad_norm": 7.394125938415527, + "learning_rate": 4.6116783525836216e-05, + "loss": 3.0498, + "step": 749500 + }, + { + "epoch": 0.2331484207303141, + "grad_norm": 8.983776092529297, + "learning_rate": 4.6114192987828096e-05, + "loss": 3.0941, + "step": 750000 + }, + { + "epoch": 0.23330385301080098, + "grad_norm": 6.057470798492432, + "learning_rate": 4.6111602449819984e-05, + "loss": 3.1006, + "step": 750500 + }, + { + "epoch": 0.23345928529128787, + "grad_norm": 7.145923137664795, + "learning_rate": 4.610901191181187e-05, + "loss": 3.0953, + "step": 751000 + }, + { + "epoch": 0.23361471757177474, + "grad_norm": 4.479586124420166, + "learning_rate": 4.610642137380375e-05, + "loss": 3.0934, + "step": 751500 + }, + { + "epoch": 0.2337701498522616, + "grad_norm": 11.00376033782959, + "learning_rate": 4.610383083579564e-05, + "loss": 3.0963, + "step": 752000 + }, + { + "epoch": 0.2339255821327485, + "grad_norm": 6.019590377807617, + "learning_rate": 4.610124029778753e-05, + "loss": 3.1088, + "step": 752500 + }, + { + "epoch": 0.23408101441323537, + "grad_norm": 24.16320037841797, + "learning_rate": 4.609864975977941e-05, + "loss": 3.1024, + "step": 753000 + }, + { + "epoch": 0.23423644669372223, + "grad_norm": 7.693138122558594, + "learning_rate": 4.60960592217713e-05, + "loss": 3.0383, + "step": 753500 + }, + { + "epoch": 0.23439187897420913, + "grad_norm": 11.098812103271484, + "learning_rate": 4.609346868376319e-05, + "loss": 3.1031, + "step": 754000 + }, + { + "epoch": 0.234547311254696, + "grad_norm": 11.591191291809082, + "learning_rate": 4.609087814575507e-05, + "loss": 3.0833, + "step": 754500 + }, + { + "epoch": 0.23470274353518286, + "grad_norm": 9.023612976074219, + "learning_rate": 4.6088287607746954e-05, + "loss": 3.1105, + "step": 755000 + }, + { + "epoch": 0.23485817581566976, + "grad_norm": 5.388576984405518, + "learning_rate": 4.608569706973884e-05, + "loss": 3.1088, + "step": 755500 + }, + { + "epoch": 0.23501360809615662, + "grad_norm": 9.202703475952148, + "learning_rate": 4.608310653173072e-05, + "loss": 3.1258, + "step": 756000 + }, + { + "epoch": 0.2351690403766435, + "grad_norm": 6.8936238288879395, + "learning_rate": 4.608051599372261e-05, + "loss": 3.0832, + "step": 756500 + }, + { + "epoch": 0.23532447265713038, + "grad_norm": 8.659178733825684, + "learning_rate": 4.6077925455714496e-05, + "loss": 3.1033, + "step": 757000 + }, + { + "epoch": 0.23547990493761725, + "grad_norm": 11.360255241394043, + "learning_rate": 4.6075334917706383e-05, + "loss": 3.0603, + "step": 757500 + }, + { + "epoch": 0.23563533721810412, + "grad_norm": 7.196190357208252, + "learning_rate": 4.607274437969827e-05, + "loss": 3.0882, + "step": 758000 + }, + { + "epoch": 0.235790769498591, + "grad_norm": 8.46656322479248, + "learning_rate": 4.607015384169015e-05, + "loss": 3.1389, + "step": 758500 + }, + { + "epoch": 0.23594620177907788, + "grad_norm": 9.156730651855469, + "learning_rate": 4.606756330368204e-05, + "loss": 3.0407, + "step": 759000 + }, + { + "epoch": 0.23610163405956475, + "grad_norm": 8.194433212280273, + "learning_rate": 4.6064972765673925e-05, + "loss": 3.1222, + "step": 759500 + }, + { + "epoch": 0.23625706634005164, + "grad_norm": 19.269489288330078, + "learning_rate": 4.6062382227665806e-05, + "loss": 3.1123, + "step": 760000 + }, + { + "epoch": 0.2364124986205385, + "grad_norm": 8.110638618469238, + "learning_rate": 4.605979168965769e-05, + "loss": 3.0716, + "step": 760500 + }, + { + "epoch": 0.23656793090102538, + "grad_norm": 9.342367172241211, + "learning_rate": 4.605720115164958e-05, + "loss": 3.0674, + "step": 761000 + }, + { + "epoch": 0.23672336318151227, + "grad_norm": 8.931631088256836, + "learning_rate": 4.605461061364146e-05, + "loss": 3.0827, + "step": 761500 + }, + { + "epoch": 0.23687879546199914, + "grad_norm": 12.819743156433105, + "learning_rate": 4.605202007563335e-05, + "loss": 3.0877, + "step": 762000 + }, + { + "epoch": 0.237034227742486, + "grad_norm": 10.268691062927246, + "learning_rate": 4.6049429537625235e-05, + "loss": 3.1465, + "step": 762500 + }, + { + "epoch": 0.2371896600229729, + "grad_norm": 6.477525234222412, + "learning_rate": 4.604683899961712e-05, + "loss": 3.0871, + "step": 763000 + }, + { + "epoch": 0.23734509230345976, + "grad_norm": 7.449978828430176, + "learning_rate": 4.604424846160901e-05, + "loss": 3.1067, + "step": 763500 + }, + { + "epoch": 0.23750052458394663, + "grad_norm": 6.99022912979126, + "learning_rate": 4.604165792360089e-05, + "loss": 3.0741, + "step": 764000 + }, + { + "epoch": 0.23765595686443353, + "grad_norm": 7.498544692993164, + "learning_rate": 4.6039067385592776e-05, + "loss": 3.1223, + "step": 764500 + }, + { + "epoch": 0.2378113891449204, + "grad_norm": 6.974576473236084, + "learning_rate": 4.6036476847584664e-05, + "loss": 3.0737, + "step": 765000 + }, + { + "epoch": 0.23796682142540726, + "grad_norm": 22.287857055664062, + "learning_rate": 4.6033886309576544e-05, + "loss": 3.1135, + "step": 765500 + }, + { + "epoch": 0.23812225370589415, + "grad_norm": 8.878180503845215, + "learning_rate": 4.603129577156843e-05, + "loss": 3.1204, + "step": 766000 + }, + { + "epoch": 0.23827768598638102, + "grad_norm": 6.967007637023926, + "learning_rate": 4.602870523356032e-05, + "loss": 3.0994, + "step": 766500 + }, + { + "epoch": 0.2384331182668679, + "grad_norm": 8.829821586608887, + "learning_rate": 4.6026114695552205e-05, + "loss": 3.0362, + "step": 767000 + }, + { + "epoch": 0.23858855054735478, + "grad_norm": 10.731303215026855, + "learning_rate": 4.602352415754409e-05, + "loss": 3.1092, + "step": 767500 + }, + { + "epoch": 0.23874398282784165, + "grad_norm": 16.696701049804688, + "learning_rate": 4.602093361953597e-05, + "loss": 3.0954, + "step": 768000 + }, + { + "epoch": 0.23889941510832852, + "grad_norm": 7.3082966804504395, + "learning_rate": 4.601834308152786e-05, + "loss": 3.072, + "step": 768500 + }, + { + "epoch": 0.2390548473888154, + "grad_norm": 7.238916397094727, + "learning_rate": 4.601575254351975e-05, + "loss": 3.0555, + "step": 769000 + }, + { + "epoch": 0.23921027966930228, + "grad_norm": 7.146639347076416, + "learning_rate": 4.601316200551163e-05, + "loss": 3.0721, + "step": 769500 + }, + { + "epoch": 0.23936571194978915, + "grad_norm": 11.385066986083984, + "learning_rate": 4.6010571467503515e-05, + "loss": 3.1186, + "step": 770000 + }, + { + "epoch": 0.23952114423027604, + "grad_norm": 8.263362884521484, + "learning_rate": 4.60079809294954e-05, + "loss": 3.1383, + "step": 770500 + }, + { + "epoch": 0.2396765765107629, + "grad_norm": 7.40037727355957, + "learning_rate": 4.600539039148728e-05, + "loss": 3.0873, + "step": 771000 + }, + { + "epoch": 0.23983200879124977, + "grad_norm": 11.687171936035156, + "learning_rate": 4.600279985347917e-05, + "loss": 3.0436, + "step": 771500 + }, + { + "epoch": 0.23998744107173667, + "grad_norm": 7.0067667961120605, + "learning_rate": 4.600020931547106e-05, + "loss": 3.1053, + "step": 772000 + }, + { + "epoch": 0.24014287335222353, + "grad_norm": 7.72511625289917, + "learning_rate": 4.5997618777462944e-05, + "loss": 3.0516, + "step": 772500 + }, + { + "epoch": 0.2402983056327104, + "grad_norm": 6.685989856719971, + "learning_rate": 4.599502823945483e-05, + "loss": 3.1195, + "step": 773000 + }, + { + "epoch": 0.2404537379131973, + "grad_norm": 9.453737258911133, + "learning_rate": 4.599243770144672e-05, + "loss": 3.1017, + "step": 773500 + }, + { + "epoch": 0.24060917019368416, + "grad_norm": 33.3604621887207, + "learning_rate": 4.59898471634386e-05, + "loss": 3.0627, + "step": 774000 + }, + { + "epoch": 0.24076460247417103, + "grad_norm": 7.805734157562256, + "learning_rate": 4.5987256625430486e-05, + "loss": 3.0933, + "step": 774500 + }, + { + "epoch": 0.24092003475465792, + "grad_norm": 7.0559234619140625, + "learning_rate": 4.5984666087422366e-05, + "loss": 3.0448, + "step": 775000 + }, + { + "epoch": 0.2410754670351448, + "grad_norm": 7.306061744689941, + "learning_rate": 4.598207554941425e-05, + "loss": 3.0812, + "step": 775500 + }, + { + "epoch": 0.24123089931563166, + "grad_norm": 7.221325874328613, + "learning_rate": 4.597948501140614e-05, + "loss": 3.0157, + "step": 776000 + }, + { + "epoch": 0.24138633159611855, + "grad_norm": 8.129195213317871, + "learning_rate": 4.597689447339803e-05, + "loss": 3.0958, + "step": 776500 + }, + { + "epoch": 0.24154176387660542, + "grad_norm": 8.0747652053833, + "learning_rate": 4.5974303935389915e-05, + "loss": 3.0829, + "step": 777000 + }, + { + "epoch": 0.2416971961570923, + "grad_norm": 7.153537750244141, + "learning_rate": 4.59717133973818e-05, + "loss": 3.0951, + "step": 777500 + }, + { + "epoch": 0.24185262843757918, + "grad_norm": 6.281228065490723, + "learning_rate": 4.596912285937368e-05, + "loss": 3.0567, + "step": 778000 + }, + { + "epoch": 0.24200806071806605, + "grad_norm": 11.51307201385498, + "learning_rate": 4.596653232136557e-05, + "loss": 3.0797, + "step": 778500 + }, + { + "epoch": 0.24216349299855291, + "grad_norm": 12.699590682983398, + "learning_rate": 4.5963941783357457e-05, + "loss": 3.0642, + "step": 779000 + }, + { + "epoch": 0.2423189252790398, + "grad_norm": 10.028389930725098, + "learning_rate": 4.596135124534934e-05, + "loss": 3.0679, + "step": 779500 + }, + { + "epoch": 0.24247435755952668, + "grad_norm": 7.499639511108398, + "learning_rate": 4.5958760707341224e-05, + "loss": 3.0774, + "step": 780000 + }, + { + "epoch": 0.24262978984001354, + "grad_norm": 8.923588752746582, + "learning_rate": 4.5956170169333104e-05, + "loss": 3.0865, + "step": 780500 + }, + { + "epoch": 0.24278522212050044, + "grad_norm": 8.653757095336914, + "learning_rate": 4.595357963132499e-05, + "loss": 3.0387, + "step": 781000 + }, + { + "epoch": 0.2429406544009873, + "grad_norm": 8.455970764160156, + "learning_rate": 4.595098909331688e-05, + "loss": 3.0549, + "step": 781500 + }, + { + "epoch": 0.24309608668147417, + "grad_norm": 7.759093761444092, + "learning_rate": 4.5948398555308766e-05, + "loss": 3.0926, + "step": 782000 + }, + { + "epoch": 0.24325151896196107, + "grad_norm": 6.794904708862305, + "learning_rate": 4.594580801730065e-05, + "loss": 3.1103, + "step": 782500 + }, + { + "epoch": 0.24340695124244793, + "grad_norm": 7.545804023742676, + "learning_rate": 4.594321747929254e-05, + "loss": 3.0789, + "step": 783000 + }, + { + "epoch": 0.2435623835229348, + "grad_norm": 9.825024604797363, + "learning_rate": 4.594062694128442e-05, + "loss": 3.0974, + "step": 783500 + }, + { + "epoch": 0.2437178158034217, + "grad_norm": 8.220877647399902, + "learning_rate": 4.593803640327631e-05, + "loss": 3.0607, + "step": 784000 + }, + { + "epoch": 0.24387324808390856, + "grad_norm": 8.453303337097168, + "learning_rate": 4.5935445865268195e-05, + "loss": 3.1428, + "step": 784500 + }, + { + "epoch": 0.24402868036439543, + "grad_norm": 8.612641334533691, + "learning_rate": 4.5932855327260075e-05, + "loss": 3.0806, + "step": 785000 + }, + { + "epoch": 0.24418411264488232, + "grad_norm": 5.832128524780273, + "learning_rate": 4.593026478925196e-05, + "loss": 3.0632, + "step": 785500 + }, + { + "epoch": 0.2443395449253692, + "grad_norm": 9.408025741577148, + "learning_rate": 4.592767425124385e-05, + "loss": 3.0398, + "step": 786000 + }, + { + "epoch": 0.24449497720585606, + "grad_norm": 8.065176963806152, + "learning_rate": 4.592508371323574e-05, + "loss": 3.1141, + "step": 786500 + }, + { + "epoch": 0.24465040948634295, + "grad_norm": 8.8375244140625, + "learning_rate": 4.5922493175227624e-05, + "loss": 3.0783, + "step": 787000 + }, + { + "epoch": 0.24480584176682982, + "grad_norm": 7.037002086639404, + "learning_rate": 4.5919902637219504e-05, + "loss": 3.1029, + "step": 787500 + }, + { + "epoch": 0.24496127404731668, + "grad_norm": 10.039609909057617, + "learning_rate": 4.591731209921139e-05, + "loss": 3.0298, + "step": 788000 + }, + { + "epoch": 0.24511670632780358, + "grad_norm": 11.20390510559082, + "learning_rate": 4.591472156120328e-05, + "loss": 3.0364, + "step": 788500 + }, + { + "epoch": 0.24527213860829045, + "grad_norm": 6.551907062530518, + "learning_rate": 4.591213102319516e-05, + "loss": 3.0736, + "step": 789000 + }, + { + "epoch": 0.2454275708887773, + "grad_norm": 15.821664810180664, + "learning_rate": 4.5909540485187046e-05, + "loss": 3.0893, + "step": 789500 + }, + { + "epoch": 0.2455830031692642, + "grad_norm": 7.899175643920898, + "learning_rate": 4.590694994717893e-05, + "loss": 3.1105, + "step": 790000 + }, + { + "epoch": 0.24573843544975107, + "grad_norm": 9.004530906677246, + "learning_rate": 4.5904359409170814e-05, + "loss": 3.1336, + "step": 790500 + }, + { + "epoch": 0.24589386773023794, + "grad_norm": 7.367190837860107, + "learning_rate": 4.59017688711627e-05, + "loss": 3.0682, + "step": 791000 + }, + { + "epoch": 0.24604930001072484, + "grad_norm": 8.958883285522461, + "learning_rate": 4.589917833315459e-05, + "loss": 3.0906, + "step": 791500 + }, + { + "epoch": 0.2462047322912117, + "grad_norm": 6.753279685974121, + "learning_rate": 4.5896587795146475e-05, + "loss": 3.0452, + "step": 792000 + }, + { + "epoch": 0.24636016457169857, + "grad_norm": 8.749480247497559, + "learning_rate": 4.589399725713836e-05, + "loss": 3.0925, + "step": 792500 + }, + { + "epoch": 0.24651559685218546, + "grad_norm": 9.280773162841797, + "learning_rate": 4.589140671913024e-05, + "loss": 3.0998, + "step": 793000 + }, + { + "epoch": 0.24667102913267233, + "grad_norm": 8.068155288696289, + "learning_rate": 4.588881618112213e-05, + "loss": 3.0549, + "step": 793500 + }, + { + "epoch": 0.2468264614131592, + "grad_norm": 7.939708232879639, + "learning_rate": 4.588622564311402e-05, + "loss": 3.0697, + "step": 794000 + }, + { + "epoch": 0.2469818936936461, + "grad_norm": 9.784419059753418, + "learning_rate": 4.58836351051059e-05, + "loss": 3.0796, + "step": 794500 + }, + { + "epoch": 0.24713732597413296, + "grad_norm": 7.518660545349121, + "learning_rate": 4.5881044567097784e-05, + "loss": 3.0763, + "step": 795000 + }, + { + "epoch": 0.24729275825461983, + "grad_norm": 9.041620254516602, + "learning_rate": 4.587845402908967e-05, + "loss": 3.0691, + "step": 795500 + }, + { + "epoch": 0.24744819053510672, + "grad_norm": 9.127012252807617, + "learning_rate": 4.587586349108156e-05, + "loss": 3.0651, + "step": 796000 + }, + { + "epoch": 0.2476036228155936, + "grad_norm": 6.725668907165527, + "learning_rate": 4.5873272953073446e-05, + "loss": 3.087, + "step": 796500 + }, + { + "epoch": 0.24775905509608045, + "grad_norm": 10.7011137008667, + "learning_rate": 4.587068241506533e-05, + "loss": 3.0636, + "step": 797000 + }, + { + "epoch": 0.24791448737656735, + "grad_norm": 9.290550231933594, + "learning_rate": 4.5868091877057213e-05, + "loss": 3.0854, + "step": 797500 + }, + { + "epoch": 0.24806991965705422, + "grad_norm": 8.060050964355469, + "learning_rate": 4.58655013390491e-05, + "loss": 3.0464, + "step": 798000 + }, + { + "epoch": 0.24822535193754108, + "grad_norm": 8.86795425415039, + "learning_rate": 4.586291080104098e-05, + "loss": 3.0733, + "step": 798500 + }, + { + "epoch": 0.24838078421802798, + "grad_norm": 8.29450511932373, + "learning_rate": 4.586032026303287e-05, + "loss": 3.0908, + "step": 799000 + }, + { + "epoch": 0.24853621649851484, + "grad_norm": 7.859611988067627, + "learning_rate": 4.5857729725024755e-05, + "loss": 3.0833, + "step": 799500 + }, + { + "epoch": 0.2486916487790017, + "grad_norm": 10.578564643859863, + "learning_rate": 4.5855139187016636e-05, + "loss": 3.0643, + "step": 800000 + }, + { + "epoch": 0.2488470810594886, + "grad_norm": 4.823127269744873, + "learning_rate": 4.585254864900852e-05, + "loss": 3.1102, + "step": 800500 + }, + { + "epoch": 0.24900251333997547, + "grad_norm": 6.841705322265625, + "learning_rate": 4.584995811100041e-05, + "loss": 3.0475, + "step": 801000 + }, + { + "epoch": 0.24915794562046234, + "grad_norm": 12.844812393188477, + "learning_rate": 4.58473675729923e-05, + "loss": 3.0411, + "step": 801500 + }, + { + "epoch": 0.24931337790094923, + "grad_norm": 6.2624006271362305, + "learning_rate": 4.5844777034984184e-05, + "loss": 3.0547, + "step": 802000 + }, + { + "epoch": 0.2494688101814361, + "grad_norm": 12.680464744567871, + "learning_rate": 4.584218649697607e-05, + "loss": 3.0893, + "step": 802500 + }, + { + "epoch": 0.24962424246192297, + "grad_norm": 8.369400024414062, + "learning_rate": 4.583959595896795e-05, + "loss": 3.0892, + "step": 803000 + }, + { + "epoch": 0.24977967474240986, + "grad_norm": 15.258700370788574, + "learning_rate": 4.583700542095984e-05, + "loss": 3.0546, + "step": 803500 + }, + { + "epoch": 0.24993510702289673, + "grad_norm": 7.4785051345825195, + "learning_rate": 4.5834414882951726e-05, + "loss": 3.0155, + "step": 804000 + }, + { + "epoch": 0.2500905393033836, + "grad_norm": 6.235614776611328, + "learning_rate": 4.5831824344943606e-05, + "loss": 3.0474, + "step": 804500 + }, + { + "epoch": 0.25024597158387046, + "grad_norm": 5.924497604370117, + "learning_rate": 4.5829233806935494e-05, + "loss": 3.0736, + "step": 805000 + }, + { + "epoch": 0.25040140386435733, + "grad_norm": 11.702980041503906, + "learning_rate": 4.5826643268927374e-05, + "loss": 3.0695, + "step": 805500 + }, + { + "epoch": 0.25055683614484425, + "grad_norm": 8.195104598999023, + "learning_rate": 4.582405273091927e-05, + "loss": 3.0836, + "step": 806000 + }, + { + "epoch": 0.2507122684253311, + "grad_norm": 15.091687202453613, + "learning_rate": 4.5821462192911155e-05, + "loss": 3.0666, + "step": 806500 + }, + { + "epoch": 0.250867700705818, + "grad_norm": 9.206249237060547, + "learning_rate": 4.5818871654903035e-05, + "loss": 3.1066, + "step": 807000 + }, + { + "epoch": 0.25102313298630485, + "grad_norm": 9.643550872802734, + "learning_rate": 4.581628111689492e-05, + "loss": 3.0874, + "step": 807500 + }, + { + "epoch": 0.2511785652667917, + "grad_norm": 7.2160868644714355, + "learning_rate": 4.581369057888681e-05, + "loss": 3.0458, + "step": 808000 + }, + { + "epoch": 0.2513339975472786, + "grad_norm": 9.15868091583252, + "learning_rate": 4.581110004087869e-05, + "loss": 3.0892, + "step": 808500 + }, + { + "epoch": 0.2514894298277655, + "grad_norm": 7.212317943572998, + "learning_rate": 4.580850950287058e-05, + "loss": 3.0845, + "step": 809000 + }, + { + "epoch": 0.2516448621082524, + "grad_norm": 11.567200660705566, + "learning_rate": 4.5805918964862464e-05, + "loss": 3.1149, + "step": 809500 + }, + { + "epoch": 0.25180029438873924, + "grad_norm": 16.183364868164062, + "learning_rate": 4.5803328426854345e-05, + "loss": 3.0913, + "step": 810000 + }, + { + "epoch": 0.2519557266692261, + "grad_norm": 8.539515495300293, + "learning_rate": 4.580073788884623e-05, + "loss": 3.0706, + "step": 810500 + }, + { + "epoch": 0.252111158949713, + "grad_norm": 8.822562217712402, + "learning_rate": 4.579814735083812e-05, + "loss": 3.1236, + "step": 811000 + }, + { + "epoch": 0.25226659123019984, + "grad_norm": 6.0861968994140625, + "learning_rate": 4.5795556812830006e-05, + "loss": 3.0807, + "step": 811500 + }, + { + "epoch": 0.25242202351068677, + "grad_norm": 9.036038398742676, + "learning_rate": 4.5792966274821893e-05, + "loss": 3.1287, + "step": 812000 + }, + { + "epoch": 0.25257745579117363, + "grad_norm": 6.433790683746338, + "learning_rate": 4.5790375736813774e-05, + "loss": 3.1187, + "step": 812500 + }, + { + "epoch": 0.2527328880716605, + "grad_norm": 8.107160568237305, + "learning_rate": 4.578778519880566e-05, + "loss": 3.089, + "step": 813000 + }, + { + "epoch": 0.25288832035214737, + "grad_norm": 8.169885635375977, + "learning_rate": 4.578519466079755e-05, + "loss": 3.063, + "step": 813500 + }, + { + "epoch": 0.25304375263263423, + "grad_norm": 8.74156665802002, + "learning_rate": 4.578260412278943e-05, + "loss": 3.0932, + "step": 814000 + }, + { + "epoch": 0.2531991849131211, + "grad_norm": 58.9300422668457, + "learning_rate": 4.5780013584781316e-05, + "loss": 3.1039, + "step": 814500 + }, + { + "epoch": 0.253354617193608, + "grad_norm": 7.775647163391113, + "learning_rate": 4.57774230467732e-05, + "loss": 3.0665, + "step": 815000 + }, + { + "epoch": 0.2535100494740949, + "grad_norm": 6.758108615875244, + "learning_rate": 4.577483250876508e-05, + "loss": 3.0875, + "step": 815500 + }, + { + "epoch": 0.25366548175458176, + "grad_norm": 6.916531562805176, + "learning_rate": 4.577224197075698e-05, + "loss": 3.0457, + "step": 816000 + }, + { + "epoch": 0.2538209140350686, + "grad_norm": 7.347177505493164, + "learning_rate": 4.576965143274886e-05, + "loss": 3.077, + "step": 816500 + }, + { + "epoch": 0.2539763463155555, + "grad_norm": 10.063681602478027, + "learning_rate": 4.5767060894740745e-05, + "loss": 3.0585, + "step": 817000 + }, + { + "epoch": 0.25413177859604236, + "grad_norm": 7.989004135131836, + "learning_rate": 4.576447035673263e-05, + "loss": 3.0334, + "step": 817500 + }, + { + "epoch": 0.2542872108765293, + "grad_norm": 7.417759418487549, + "learning_rate": 4.576187981872451e-05, + "loss": 3.0475, + "step": 818000 + }, + { + "epoch": 0.25444264315701615, + "grad_norm": 8.995436668395996, + "learning_rate": 4.57592892807164e-05, + "loss": 3.0821, + "step": 818500 + }, + { + "epoch": 0.254598075437503, + "grad_norm": 7.082031726837158, + "learning_rate": 4.5756698742708286e-05, + "loss": 3.1101, + "step": 819000 + }, + { + "epoch": 0.2547535077179899, + "grad_norm": 7.7720746994018555, + "learning_rate": 4.575410820470017e-05, + "loss": 3.0467, + "step": 819500 + }, + { + "epoch": 0.25490893999847675, + "grad_norm": 8.333396911621094, + "learning_rate": 4.5751517666692054e-05, + "loss": 3.0403, + "step": 820000 + }, + { + "epoch": 0.2550643722789636, + "grad_norm": 7.610442161560059, + "learning_rate": 4.574892712868394e-05, + "loss": 3.0666, + "step": 820500 + }, + { + "epoch": 0.25521980455945054, + "grad_norm": 6.928641319274902, + "learning_rate": 4.574633659067583e-05, + "loss": 3.0632, + "step": 821000 + }, + { + "epoch": 0.2553752368399374, + "grad_norm": 7.581783771514893, + "learning_rate": 4.5743746052667715e-05, + "loss": 3.0813, + "step": 821500 + }, + { + "epoch": 0.25553066912042427, + "grad_norm": 25.528064727783203, + "learning_rate": 4.57411555146596e-05, + "loss": 3.0535, + "step": 822000 + }, + { + "epoch": 0.25568610140091114, + "grad_norm": 7.463978290557861, + "learning_rate": 4.573856497665148e-05, + "loss": 3.0811, + "step": 822500 + }, + { + "epoch": 0.255841533681398, + "grad_norm": 8.877655029296875, + "learning_rate": 4.573597443864337e-05, + "loss": 3.0755, + "step": 823000 + }, + { + "epoch": 0.25599696596188487, + "grad_norm": 8.192237854003906, + "learning_rate": 4.573338390063525e-05, + "loss": 3.0822, + "step": 823500 + }, + { + "epoch": 0.2561523982423718, + "grad_norm": 6.0090250968933105, + "learning_rate": 4.573079336262714e-05, + "loss": 3.1325, + "step": 824000 + }, + { + "epoch": 0.25630783052285866, + "grad_norm": 9.812878608703613, + "learning_rate": 4.5728202824619025e-05, + "loss": 3.0645, + "step": 824500 + }, + { + "epoch": 0.2564632628033455, + "grad_norm": 9.237828254699707, + "learning_rate": 4.5725612286610905e-05, + "loss": 3.0983, + "step": 825000 + }, + { + "epoch": 0.2566186950838324, + "grad_norm": 7.595101833343506, + "learning_rate": 4.572302174860279e-05, + "loss": 3.0982, + "step": 825500 + }, + { + "epoch": 0.25677412736431926, + "grad_norm": 7.703747272491455, + "learning_rate": 4.5720431210594686e-05, + "loss": 3.0527, + "step": 826000 + }, + { + "epoch": 0.2569295596448061, + "grad_norm": 9.47680950164795, + "learning_rate": 4.571784067258657e-05, + "loss": 3.0872, + "step": 826500 + }, + { + "epoch": 0.25708499192529305, + "grad_norm": 8.377710342407227, + "learning_rate": 4.5715250134578454e-05, + "loss": 3.0534, + "step": 827000 + }, + { + "epoch": 0.2572404242057799, + "grad_norm": 9.572712898254395, + "learning_rate": 4.571265959657034e-05, + "loss": 3.1003, + "step": 827500 + }, + { + "epoch": 0.2573958564862668, + "grad_norm": 10.732000350952148, + "learning_rate": 4.571006905856222e-05, + "loss": 3.0806, + "step": 828000 + }, + { + "epoch": 0.25755128876675365, + "grad_norm": 11.661592483520508, + "learning_rate": 4.570747852055411e-05, + "loss": 3.0786, + "step": 828500 + }, + { + "epoch": 0.2577067210472405, + "grad_norm": 10.141729354858398, + "learning_rate": 4.570488798254599e-05, + "loss": 3.0927, + "step": 829000 + }, + { + "epoch": 0.2578621533277274, + "grad_norm": 7.447903156280518, + "learning_rate": 4.5702297444537876e-05, + "loss": 3.0869, + "step": 829500 + }, + { + "epoch": 0.2580175856082143, + "grad_norm": 8.864701271057129, + "learning_rate": 4.569970690652976e-05, + "loss": 3.07, + "step": 830000 + }, + { + "epoch": 0.2581730178887012, + "grad_norm": 8.48563003540039, + "learning_rate": 4.569711636852165e-05, + "loss": 3.0374, + "step": 830500 + }, + { + "epoch": 0.25832845016918804, + "grad_norm": 6.521554470062256, + "learning_rate": 4.569452583051354e-05, + "loss": 3.0445, + "step": 831000 + }, + { + "epoch": 0.2584838824496749, + "grad_norm": 11.629188537597656, + "learning_rate": 4.5691935292505425e-05, + "loss": 3.0745, + "step": 831500 + }, + { + "epoch": 0.2586393147301618, + "grad_norm": 16.49677276611328, + "learning_rate": 4.5689344754497305e-05, + "loss": 3.0465, + "step": 832000 + }, + { + "epoch": 0.25879474701064864, + "grad_norm": 6.534369945526123, + "learning_rate": 4.568675421648919e-05, + "loss": 3.0935, + "step": 832500 + }, + { + "epoch": 0.25895017929113556, + "grad_norm": 10.5376615524292, + "learning_rate": 4.568416367848108e-05, + "loss": 3.054, + "step": 833000 + }, + { + "epoch": 0.25910561157162243, + "grad_norm": 8.996853828430176, + "learning_rate": 4.568157314047296e-05, + "loss": 3.071, + "step": 833500 + }, + { + "epoch": 0.2592610438521093, + "grad_norm": 13.520620346069336, + "learning_rate": 4.567898260246485e-05, + "loss": 3.0711, + "step": 834000 + }, + { + "epoch": 0.25941647613259616, + "grad_norm": 8.787803649902344, + "learning_rate": 4.567639206445673e-05, + "loss": 3.0778, + "step": 834500 + }, + { + "epoch": 0.25957190841308303, + "grad_norm": 8.094776153564453, + "learning_rate": 4.5673801526448614e-05, + "loss": 3.0728, + "step": 835000 + }, + { + "epoch": 0.2597273406935699, + "grad_norm": 8.381134033203125, + "learning_rate": 4.56712109884405e-05, + "loss": 3.1257, + "step": 835500 + }, + { + "epoch": 0.2598827729740568, + "grad_norm": 10.163546562194824, + "learning_rate": 4.566862045043239e-05, + "loss": 3.0935, + "step": 836000 + }, + { + "epoch": 0.2600382052545437, + "grad_norm": 6.754650115966797, + "learning_rate": 4.5666029912424276e-05, + "loss": 3.0704, + "step": 836500 + }, + { + "epoch": 0.26019363753503055, + "grad_norm": 9.589583396911621, + "learning_rate": 4.566343937441616e-05, + "loss": 3.0861, + "step": 837000 + }, + { + "epoch": 0.2603490698155174, + "grad_norm": 6.849269866943359, + "learning_rate": 4.566084883640804e-05, + "loss": 3.0933, + "step": 837500 + }, + { + "epoch": 0.2605045020960043, + "grad_norm": 8.291600227355957, + "learning_rate": 4.565825829839993e-05, + "loss": 3.0602, + "step": 838000 + }, + { + "epoch": 0.26065993437649115, + "grad_norm": 34.74884796142578, + "learning_rate": 4.565566776039182e-05, + "loss": 3.0826, + "step": 838500 + }, + { + "epoch": 0.2608153666569781, + "grad_norm": 5.338313579559326, + "learning_rate": 4.56530772223837e-05, + "loss": 3.0746, + "step": 839000 + }, + { + "epoch": 0.26097079893746494, + "grad_norm": 7.494378089904785, + "learning_rate": 4.5650486684375585e-05, + "loss": 3.0206, + "step": 839500 + }, + { + "epoch": 0.2611262312179518, + "grad_norm": 7.749965190887451, + "learning_rate": 4.564789614636747e-05, + "loss": 3.0728, + "step": 840000 + }, + { + "epoch": 0.2612816634984387, + "grad_norm": 6.900223731994629, + "learning_rate": 4.564530560835936e-05, + "loss": 3.07, + "step": 840500 + }, + { + "epoch": 0.26143709577892554, + "grad_norm": 7.898982048034668, + "learning_rate": 4.564271507035125e-05, + "loss": 3.0795, + "step": 841000 + }, + { + "epoch": 0.2615925280594124, + "grad_norm": 7.938537120819092, + "learning_rate": 4.564012453234313e-05, + "loss": 3.1133, + "step": 841500 + }, + { + "epoch": 0.26174796033989933, + "grad_norm": 9.846366882324219, + "learning_rate": 4.5637533994335014e-05, + "loss": 3.0889, + "step": 842000 + }, + { + "epoch": 0.2619033926203862, + "grad_norm": 8.784650802612305, + "learning_rate": 4.56349434563269e-05, + "loss": 3.0646, + "step": 842500 + }, + { + "epoch": 0.26205882490087307, + "grad_norm": 8.902210235595703, + "learning_rate": 4.563235291831878e-05, + "loss": 3.0794, + "step": 843000 + }, + { + "epoch": 0.26221425718135993, + "grad_norm": 13.720705032348633, + "learning_rate": 4.562976238031067e-05, + "loss": 3.1047, + "step": 843500 + }, + { + "epoch": 0.2623696894618468, + "grad_norm": 5.722543716430664, + "learning_rate": 4.5627171842302556e-05, + "loss": 3.016, + "step": 844000 + }, + { + "epoch": 0.26252512174233367, + "grad_norm": 6.443589210510254, + "learning_rate": 4.5624581304294436e-05, + "loss": 3.059, + "step": 844500 + }, + { + "epoch": 0.2626805540228206, + "grad_norm": 8.519850730895996, + "learning_rate": 4.5621990766286324e-05, + "loss": 3.0629, + "step": 845000 + }, + { + "epoch": 0.26283598630330746, + "grad_norm": 15.56200885772705, + "learning_rate": 4.561940022827821e-05, + "loss": 3.0677, + "step": 845500 + }, + { + "epoch": 0.2629914185837943, + "grad_norm": 8.087678909301758, + "learning_rate": 4.56168096902701e-05, + "loss": 3.064, + "step": 846000 + }, + { + "epoch": 0.2631468508642812, + "grad_norm": 7.885746002197266, + "learning_rate": 4.5614219152261985e-05, + "loss": 3.0619, + "step": 846500 + }, + { + "epoch": 0.26330228314476806, + "grad_norm": 10.875360488891602, + "learning_rate": 4.5611628614253865e-05, + "loss": 3.0753, + "step": 847000 + }, + { + "epoch": 0.2634577154252549, + "grad_norm": 7.420193195343018, + "learning_rate": 4.560903807624575e-05, + "loss": 3.092, + "step": 847500 + }, + { + "epoch": 0.26361314770574185, + "grad_norm": 9.11772632598877, + "learning_rate": 4.560644753823764e-05, + "loss": 3.0839, + "step": 848000 + }, + { + "epoch": 0.2637685799862287, + "grad_norm": 5.76719331741333, + "learning_rate": 4.560385700022952e-05, + "loss": 3.0768, + "step": 848500 + }, + { + "epoch": 0.2639240122667156, + "grad_norm": 7.524770259857178, + "learning_rate": 4.560126646222141e-05, + "loss": 3.093, + "step": 849000 + }, + { + "epoch": 0.26407944454720245, + "grad_norm": 7.620481491088867, + "learning_rate": 4.5598675924213294e-05, + "loss": 3.0943, + "step": 849500 + }, + { + "epoch": 0.2642348768276893, + "grad_norm": 10.314866065979004, + "learning_rate": 4.559608538620518e-05, + "loss": 3.0846, + "step": 850000 + }, + { + "epoch": 0.2643903091081762, + "grad_norm": 7.17992639541626, + "learning_rate": 4.559349484819707e-05, + "loss": 3.0127, + "step": 850500 + }, + { + "epoch": 0.2645457413886631, + "grad_norm": 6.873992443084717, + "learning_rate": 4.5590904310188956e-05, + "loss": 3.0636, + "step": 851000 + }, + { + "epoch": 0.26470117366914997, + "grad_norm": 6.822232246398926, + "learning_rate": 4.5588313772180836e-05, + "loss": 3.0862, + "step": 851500 + }, + { + "epoch": 0.26485660594963684, + "grad_norm": 8.74211311340332, + "learning_rate": 4.5585723234172723e-05, + "loss": 3.0792, + "step": 852000 + }, + { + "epoch": 0.2650120382301237, + "grad_norm": 8.07431697845459, + "learning_rate": 4.5583132696164604e-05, + "loss": 3.0927, + "step": 852500 + }, + { + "epoch": 0.26516747051061057, + "grad_norm": 8.063812255859375, + "learning_rate": 4.558054215815649e-05, + "loss": 3.0735, + "step": 853000 + }, + { + "epoch": 0.26532290279109744, + "grad_norm": 14.139162063598633, + "learning_rate": 4.557795162014838e-05, + "loss": 3.1366, + "step": 853500 + }, + { + "epoch": 0.26547833507158436, + "grad_norm": 8.528549194335938, + "learning_rate": 4.557536108214026e-05, + "loss": 3.0482, + "step": 854000 + }, + { + "epoch": 0.2656337673520712, + "grad_norm": 8.824786186218262, + "learning_rate": 4.5572770544132146e-05, + "loss": 3.0891, + "step": 854500 + }, + { + "epoch": 0.2657891996325581, + "grad_norm": 9.012481689453125, + "learning_rate": 4.557018000612403e-05, + "loss": 3.067, + "step": 855000 + }, + { + "epoch": 0.26594463191304496, + "grad_norm": 7.583472728729248, + "learning_rate": 4.556758946811592e-05, + "loss": 3.0492, + "step": 855500 + }, + { + "epoch": 0.2661000641935318, + "grad_norm": 7.528377532958984, + "learning_rate": 4.556499893010781e-05, + "loss": 3.0418, + "step": 856000 + }, + { + "epoch": 0.2662554964740187, + "grad_norm": 6.979521751403809, + "learning_rate": 4.5562408392099694e-05, + "loss": 3.0274, + "step": 856500 + }, + { + "epoch": 0.2664109287545056, + "grad_norm": 23.85808753967285, + "learning_rate": 4.5559817854091575e-05, + "loss": 3.074, + "step": 857000 + }, + { + "epoch": 0.2665663610349925, + "grad_norm": 10.512228012084961, + "learning_rate": 4.555722731608346e-05, + "loss": 3.0833, + "step": 857500 + }, + { + "epoch": 0.26672179331547935, + "grad_norm": 13.244112968444824, + "learning_rate": 4.555463677807535e-05, + "loss": 3.0573, + "step": 858000 + }, + { + "epoch": 0.2668772255959662, + "grad_norm": 7.158172607421875, + "learning_rate": 4.555204624006723e-05, + "loss": 3.14, + "step": 858500 + }, + { + "epoch": 0.2670326578764531, + "grad_norm": 9.18485164642334, + "learning_rate": 4.5549455702059116e-05, + "loss": 3.0924, + "step": 859000 + }, + { + "epoch": 0.26718809015693995, + "grad_norm": 21.611175537109375, + "learning_rate": 4.5546865164051004e-05, + "loss": 3.0476, + "step": 859500 + }, + { + "epoch": 0.2673435224374269, + "grad_norm": 8.571802139282227, + "learning_rate": 4.554427462604289e-05, + "loss": 3.0875, + "step": 860000 + }, + { + "epoch": 0.26749895471791374, + "grad_norm": 6.166088104248047, + "learning_rate": 4.554168408803478e-05, + "loss": 3.0398, + "step": 860500 + }, + { + "epoch": 0.2676543869984006, + "grad_norm": 11.596806526184082, + "learning_rate": 4.553909355002666e-05, + "loss": 3.0546, + "step": 861000 + }, + { + "epoch": 0.2678098192788875, + "grad_norm": 8.50732707977295, + "learning_rate": 4.5536503012018545e-05, + "loss": 3.0728, + "step": 861500 + }, + { + "epoch": 0.26796525155937434, + "grad_norm": 7.266512393951416, + "learning_rate": 4.553391247401043e-05, + "loss": 3.0965, + "step": 862000 + }, + { + "epoch": 0.2681206838398612, + "grad_norm": 8.730534553527832, + "learning_rate": 4.553132193600231e-05, + "loss": 3.075, + "step": 862500 + }, + { + "epoch": 0.26827611612034813, + "grad_norm": 24.719507217407227, + "learning_rate": 4.55287313979942e-05, + "loss": 3.0637, + "step": 863000 + }, + { + "epoch": 0.268431548400835, + "grad_norm": 9.35635757446289, + "learning_rate": 4.552614085998609e-05, + "loss": 3.0428, + "step": 863500 + }, + { + "epoch": 0.26858698068132186, + "grad_norm": 9.340043067932129, + "learning_rate": 4.552355032197797e-05, + "loss": 3.0936, + "step": 864000 + }, + { + "epoch": 0.26874241296180873, + "grad_norm": 8.406676292419434, + "learning_rate": 4.5520959783969855e-05, + "loss": 3.0571, + "step": 864500 + }, + { + "epoch": 0.2688978452422956, + "grad_norm": 12.941739082336426, + "learning_rate": 4.551836924596174e-05, + "loss": 3.0912, + "step": 865000 + }, + { + "epoch": 0.26905327752278246, + "grad_norm": 8.998320579528809, + "learning_rate": 4.551577870795363e-05, + "loss": 3.0745, + "step": 865500 + }, + { + "epoch": 0.2692087098032694, + "grad_norm": 7.413263320922852, + "learning_rate": 4.5513188169945516e-05, + "loss": 3.1085, + "step": 866000 + }, + { + "epoch": 0.26936414208375625, + "grad_norm": 8.217413902282715, + "learning_rate": 4.55105976319374e-05, + "loss": 3.0983, + "step": 866500 + }, + { + "epoch": 0.2695195743642431, + "grad_norm": 9.074705123901367, + "learning_rate": 4.5508007093929284e-05, + "loss": 3.0695, + "step": 867000 + }, + { + "epoch": 0.26967500664473, + "grad_norm": 8.141487121582031, + "learning_rate": 4.550541655592117e-05, + "loss": 3.0711, + "step": 867500 + }, + { + "epoch": 0.26983043892521685, + "grad_norm": 6.369366645812988, + "learning_rate": 4.550282601791305e-05, + "loss": 3.0581, + "step": 868000 + }, + { + "epoch": 0.2699858712057037, + "grad_norm": 7.522904872894287, + "learning_rate": 4.550023547990494e-05, + "loss": 3.0782, + "step": 868500 + }, + { + "epoch": 0.27014130348619064, + "grad_norm": 7.84541654586792, + "learning_rate": 4.5497644941896826e-05, + "loss": 3.0862, + "step": 869000 + }, + { + "epoch": 0.2702967357666775, + "grad_norm": 12.67497444152832, + "learning_rate": 4.549505440388871e-05, + "loss": 3.0813, + "step": 869500 + }, + { + "epoch": 0.2704521680471644, + "grad_norm": 7.390239715576172, + "learning_rate": 4.54924638658806e-05, + "loss": 3.0726, + "step": 870000 + }, + { + "epoch": 0.27060760032765124, + "grad_norm": 11.832084655761719, + "learning_rate": 4.548987332787248e-05, + "loss": 3.0416, + "step": 870500 + }, + { + "epoch": 0.2707630326081381, + "grad_norm": 6.58420991897583, + "learning_rate": 4.548728278986437e-05, + "loss": 3.0839, + "step": 871000 + }, + { + "epoch": 0.270918464888625, + "grad_norm": 7.08160400390625, + "learning_rate": 4.5484692251856255e-05, + "loss": 3.0698, + "step": 871500 + }, + { + "epoch": 0.2710738971691119, + "grad_norm": 9.669782638549805, + "learning_rate": 4.5482101713848135e-05, + "loss": 3.0603, + "step": 872000 + }, + { + "epoch": 0.27122932944959877, + "grad_norm": 11.937777519226074, + "learning_rate": 4.547951117584002e-05, + "loss": 3.073, + "step": 872500 + }, + { + "epoch": 0.27138476173008563, + "grad_norm": 7.504678249359131, + "learning_rate": 4.547692063783191e-05, + "loss": 3.0938, + "step": 873000 + }, + { + "epoch": 0.2715401940105725, + "grad_norm": 7.165278911590576, + "learning_rate": 4.547433009982379e-05, + "loss": 3.0422, + "step": 873500 + }, + { + "epoch": 0.27169562629105937, + "grad_norm": 6.944035530090332, + "learning_rate": 4.547173956181568e-05, + "loss": 3.0675, + "step": 874000 + }, + { + "epoch": 0.27185105857154623, + "grad_norm": 8.265334129333496, + "learning_rate": 4.5469149023807564e-05, + "loss": 3.0287, + "step": 874500 + }, + { + "epoch": 0.27200649085203316, + "grad_norm": 14.542994499206543, + "learning_rate": 4.546655848579945e-05, + "loss": 3.0553, + "step": 875000 + }, + { + "epoch": 0.27216192313252, + "grad_norm": 9.019545555114746, + "learning_rate": 4.546396794779134e-05, + "loss": 3.0551, + "step": 875500 + }, + { + "epoch": 0.2723173554130069, + "grad_norm": 8.142510414123535, + "learning_rate": 4.5461377409783225e-05, + "loss": 3.0149, + "step": 876000 + }, + { + "epoch": 0.27247278769349376, + "grad_norm": 7.849638938903809, + "learning_rate": 4.5458786871775106e-05, + "loss": 3.1123, + "step": 876500 + }, + { + "epoch": 0.2726282199739806, + "grad_norm": 7.744863510131836, + "learning_rate": 4.545619633376699e-05, + "loss": 3.0461, + "step": 877000 + }, + { + "epoch": 0.2727836522544675, + "grad_norm": 10.651440620422363, + "learning_rate": 4.545360579575887e-05, + "loss": 3.0934, + "step": 877500 + }, + { + "epoch": 0.2729390845349544, + "grad_norm": 7.094332695007324, + "learning_rate": 4.545101525775076e-05, + "loss": 3.056, + "step": 878000 + }, + { + "epoch": 0.2730945168154413, + "grad_norm": 8.582879066467285, + "learning_rate": 4.544842471974265e-05, + "loss": 3.044, + "step": 878500 + }, + { + "epoch": 0.27324994909592815, + "grad_norm": 9.893468856811523, + "learning_rate": 4.544583418173453e-05, + "loss": 3.0466, + "step": 879000 + }, + { + "epoch": 0.273405381376415, + "grad_norm": 8.499627113342285, + "learning_rate": 4.544324364372642e-05, + "loss": 3.0895, + "step": 879500 + }, + { + "epoch": 0.2735608136569019, + "grad_norm": 7.931820869445801, + "learning_rate": 4.544065310571831e-05, + "loss": 3.069, + "step": 880000 + }, + { + "epoch": 0.27371624593738875, + "grad_norm": 14.466115951538086, + "learning_rate": 4.543806256771019e-05, + "loss": 3.0409, + "step": 880500 + }, + { + "epoch": 0.27387167821787567, + "grad_norm": 9.30761432647705, + "learning_rate": 4.543547202970208e-05, + "loss": 3.0362, + "step": 881000 + }, + { + "epoch": 0.27402711049836254, + "grad_norm": 7.96787166595459, + "learning_rate": 4.5432881491693964e-05, + "loss": 3.1257, + "step": 881500 + }, + { + "epoch": 0.2741825427788494, + "grad_norm": 11.107354164123535, + "learning_rate": 4.5430290953685844e-05, + "loss": 3.0333, + "step": 882000 + }, + { + "epoch": 0.27433797505933627, + "grad_norm": 60.41530227661133, + "learning_rate": 4.542770041567773e-05, + "loss": 3.0584, + "step": 882500 + }, + { + "epoch": 0.27449340733982314, + "grad_norm": 7.355981826782227, + "learning_rate": 4.542510987766961e-05, + "loss": 3.0441, + "step": 883000 + }, + { + "epoch": 0.27464883962031, + "grad_norm": 11.277392387390137, + "learning_rate": 4.54225193396615e-05, + "loss": 3.0045, + "step": 883500 + }, + { + "epoch": 0.2748042719007969, + "grad_norm": 16.66672134399414, + "learning_rate": 4.5419928801653386e-05, + "loss": 3.0752, + "step": 884000 + }, + { + "epoch": 0.2749597041812838, + "grad_norm": 7.048306465148926, + "learning_rate": 4.541733826364527e-05, + "loss": 3.0453, + "step": 884500 + }, + { + "epoch": 0.27511513646177066, + "grad_norm": 9.876364707946777, + "learning_rate": 4.541474772563716e-05, + "loss": 3.0529, + "step": 885000 + }, + { + "epoch": 0.2752705687422575, + "grad_norm": 16.353607177734375, + "learning_rate": 4.541215718762905e-05, + "loss": 3.0995, + "step": 885500 + }, + { + "epoch": 0.2754260010227444, + "grad_norm": 8.665085792541504, + "learning_rate": 4.540956664962093e-05, + "loss": 3.0782, + "step": 886000 + }, + { + "epoch": 0.27558143330323126, + "grad_norm": 8.787431716918945, + "learning_rate": 4.5406976111612815e-05, + "loss": 3.1147, + "step": 886500 + }, + { + "epoch": 0.2757368655837182, + "grad_norm": 10.091407775878906, + "learning_rate": 4.54043855736047e-05, + "loss": 3.0544, + "step": 887000 + }, + { + "epoch": 0.27589229786420505, + "grad_norm": 8.68482780456543, + "learning_rate": 4.540179503559658e-05, + "loss": 3.0528, + "step": 887500 + }, + { + "epoch": 0.2760477301446919, + "grad_norm": 7.8241753578186035, + "learning_rate": 4.539920449758847e-05, + "loss": 3.1144, + "step": 888000 + }, + { + "epoch": 0.2762031624251788, + "grad_norm": 9.35127067565918, + "learning_rate": 4.539661395958035e-05, + "loss": 3.0827, + "step": 888500 + }, + { + "epoch": 0.27635859470566565, + "grad_norm": 7.508307933807373, + "learning_rate": 4.539402342157224e-05, + "loss": 3.0555, + "step": 889000 + }, + { + "epoch": 0.2765140269861525, + "grad_norm": 15.493172645568848, + "learning_rate": 4.539143288356413e-05, + "loss": 3.0766, + "step": 889500 + }, + { + "epoch": 0.27666945926663944, + "grad_norm": 13.376007080078125, + "learning_rate": 4.538884234555601e-05, + "loss": 3.0368, + "step": 890000 + }, + { + "epoch": 0.2768248915471263, + "grad_norm": 6.428855895996094, + "learning_rate": 4.53862518075479e-05, + "loss": 3.0786, + "step": 890500 + }, + { + "epoch": 0.2769803238276132, + "grad_norm": 7.621070384979248, + "learning_rate": 4.5383661269539786e-05, + "loss": 3.0668, + "step": 891000 + }, + { + "epoch": 0.27713575610810004, + "grad_norm": 7.439842224121094, + "learning_rate": 4.5381070731531666e-05, + "loss": 3.0717, + "step": 891500 + }, + { + "epoch": 0.2772911883885869, + "grad_norm": 14.79054069519043, + "learning_rate": 4.537848019352355e-05, + "loss": 3.0951, + "step": 892000 + }, + { + "epoch": 0.2774466206690738, + "grad_norm": 6.326071262359619, + "learning_rate": 4.537588965551544e-05, + "loss": 3.0893, + "step": 892500 + }, + { + "epoch": 0.2776020529495607, + "grad_norm": 7.642025947570801, + "learning_rate": 4.537329911750732e-05, + "loss": 3.0427, + "step": 893000 + }, + { + "epoch": 0.27775748523004756, + "grad_norm": 9.861166954040527, + "learning_rate": 4.537070857949921e-05, + "loss": 3.0563, + "step": 893500 + }, + { + "epoch": 0.27791291751053443, + "grad_norm": 26.08842658996582, + "learning_rate": 4.5368118041491095e-05, + "loss": 3.1113, + "step": 894000 + }, + { + "epoch": 0.2780683497910213, + "grad_norm": 6.939576148986816, + "learning_rate": 4.536552750348298e-05, + "loss": 3.036, + "step": 894500 + }, + { + "epoch": 0.27822378207150816, + "grad_norm": 12.821187973022461, + "learning_rate": 4.536293696547487e-05, + "loss": 3.0976, + "step": 895000 + }, + { + "epoch": 0.27837921435199503, + "grad_norm": 7.9253034591674805, + "learning_rate": 4.536034642746675e-05, + "loss": 3.0603, + "step": 895500 + }, + { + "epoch": 0.27853464663248195, + "grad_norm": 6.8864426612854, + "learning_rate": 4.535775588945864e-05, + "loss": 3.0871, + "step": 896000 + }, + { + "epoch": 0.2786900789129688, + "grad_norm": 6.916139602661133, + "learning_rate": 4.5355165351450524e-05, + "loss": 3.0385, + "step": 896500 + }, + { + "epoch": 0.2788455111934557, + "grad_norm": 8.135237693786621, + "learning_rate": 4.5352574813442405e-05, + "loss": 3.023, + "step": 897000 + }, + { + "epoch": 0.27900094347394255, + "grad_norm": 9.262121200561523, + "learning_rate": 4.534998427543429e-05, + "loss": 3.0553, + "step": 897500 + }, + { + "epoch": 0.2791563757544294, + "grad_norm": 6.359518527984619, + "learning_rate": 4.534739373742618e-05, + "loss": 3.0363, + "step": 898000 + }, + { + "epoch": 0.2793118080349163, + "grad_norm": 8.870782852172852, + "learning_rate": 4.534480319941806e-05, + "loss": 3.1026, + "step": 898500 + }, + { + "epoch": 0.2794672403154032, + "grad_norm": 7.386320114135742, + "learning_rate": 4.5342212661409946e-05, + "loss": 3.1088, + "step": 899000 + }, + { + "epoch": 0.2796226725958901, + "grad_norm": 7.5215020179748535, + "learning_rate": 4.533962212340184e-05, + "loss": 3.0465, + "step": 899500 + }, + { + "epoch": 0.27977810487637694, + "grad_norm": 6.3792009353637695, + "learning_rate": 4.533703158539372e-05, + "loss": 3.0708, + "step": 900000 + }, + { + "epoch": 0.2799335371568638, + "grad_norm": 7.439046859741211, + "learning_rate": 4.533444104738561e-05, + "loss": 3.0686, + "step": 900500 + }, + { + "epoch": 0.2800889694373507, + "grad_norm": 7.012774467468262, + "learning_rate": 4.533185050937749e-05, + "loss": 3.0543, + "step": 901000 + }, + { + "epoch": 0.28024440171783754, + "grad_norm": 9.545361518859863, + "learning_rate": 4.5329259971369375e-05, + "loss": 3.0582, + "step": 901500 + }, + { + "epoch": 0.28039983399832447, + "grad_norm": 10.595487594604492, + "learning_rate": 4.532666943336126e-05, + "loss": 3.0676, + "step": 902000 + }, + { + "epoch": 0.28055526627881133, + "grad_norm": 6.860518932342529, + "learning_rate": 4.532407889535314e-05, + "loss": 3.0454, + "step": 902500 + }, + { + "epoch": 0.2807106985592982, + "grad_norm": 7.495266914367676, + "learning_rate": 4.532148835734503e-05, + "loss": 3.0844, + "step": 903000 + }, + { + "epoch": 0.28086613083978507, + "grad_norm": 10.68359088897705, + "learning_rate": 4.531889781933692e-05, + "loss": 3.057, + "step": 903500 + }, + { + "epoch": 0.28102156312027193, + "grad_norm": 5.721611976623535, + "learning_rate": 4.5316307281328804e-05, + "loss": 3.0647, + "step": 904000 + }, + { + "epoch": 0.2811769954007588, + "grad_norm": 32.9315071105957, + "learning_rate": 4.531371674332069e-05, + "loss": 3.0466, + "step": 904500 + }, + { + "epoch": 0.2813324276812457, + "grad_norm": 8.715953826904297, + "learning_rate": 4.531112620531258e-05, + "loss": 3.0481, + "step": 905000 + }, + { + "epoch": 0.2814878599617326, + "grad_norm": 5.335018157958984, + "learning_rate": 4.530853566730446e-05, + "loss": 3.0662, + "step": 905500 + }, + { + "epoch": 0.28164329224221946, + "grad_norm": 8.180047988891602, + "learning_rate": 4.5305945129296346e-05, + "loss": 3.0898, + "step": 906000 + }, + { + "epoch": 0.2817987245227063, + "grad_norm": 6.8043928146362305, + "learning_rate": 4.5303354591288227e-05, + "loss": 3.1, + "step": 906500 + }, + { + "epoch": 0.2819541568031932, + "grad_norm": 7.003060817718506, + "learning_rate": 4.5300764053280114e-05, + "loss": 3.0668, + "step": 907000 + }, + { + "epoch": 0.28210958908368006, + "grad_norm": 12.930397033691406, + "learning_rate": 4.5298173515272e-05, + "loss": 3.0443, + "step": 907500 + }, + { + "epoch": 0.282265021364167, + "grad_norm": 9.370779037475586, + "learning_rate": 4.529558297726388e-05, + "loss": 2.9955, + "step": 908000 + }, + { + "epoch": 0.28242045364465385, + "grad_norm": 7.289252758026123, + "learning_rate": 4.529299243925577e-05, + "loss": 3.0593, + "step": 908500 + }, + { + "epoch": 0.2825758859251407, + "grad_norm": 10.605660438537598, + "learning_rate": 4.5290401901247656e-05, + "loss": 3.0408, + "step": 909000 + }, + { + "epoch": 0.2827313182056276, + "grad_norm": 17.33424186706543, + "learning_rate": 4.528781136323954e-05, + "loss": 3.0664, + "step": 909500 + }, + { + "epoch": 0.28288675048611445, + "grad_norm": 8.798460006713867, + "learning_rate": 4.528522082523143e-05, + "loss": 3.0965, + "step": 910000 + }, + { + "epoch": 0.2830421827666013, + "grad_norm": 7.119205951690674, + "learning_rate": 4.528263028722332e-05, + "loss": 3.0931, + "step": 910500 + }, + { + "epoch": 0.28319761504708824, + "grad_norm": 7.587025165557861, + "learning_rate": 4.52800397492152e-05, + "loss": 3.0522, + "step": 911000 + }, + { + "epoch": 0.2833530473275751, + "grad_norm": 11.848862648010254, + "learning_rate": 4.5277449211207085e-05, + "loss": 3.0617, + "step": 911500 + }, + { + "epoch": 0.28350847960806197, + "grad_norm": 11.582472801208496, + "learning_rate": 4.527485867319897e-05, + "loss": 3.0672, + "step": 912000 + }, + { + "epoch": 0.28366391188854884, + "grad_norm": 17.75034523010254, + "learning_rate": 4.527226813519085e-05, + "loss": 3.067, + "step": 912500 + }, + { + "epoch": 0.2838193441690357, + "grad_norm": 14.569832801818848, + "learning_rate": 4.526967759718274e-05, + "loss": 3.0548, + "step": 913000 + }, + { + "epoch": 0.28397477644952257, + "grad_norm": 7.474301815032959, + "learning_rate": 4.5267087059174626e-05, + "loss": 3.0599, + "step": 913500 + }, + { + "epoch": 0.2841302087300095, + "grad_norm": 7.761072635650635, + "learning_rate": 4.5264496521166514e-05, + "loss": 3.0728, + "step": 914000 + }, + { + "epoch": 0.28428564101049636, + "grad_norm": 9.083037376403809, + "learning_rate": 4.52619059831584e-05, + "loss": 3.0993, + "step": 914500 + }, + { + "epoch": 0.2844410732909832, + "grad_norm": 9.181802749633789, + "learning_rate": 4.525931544515028e-05, + "loss": 3.0759, + "step": 915000 + }, + { + "epoch": 0.2845965055714701, + "grad_norm": 6.313485622406006, + "learning_rate": 4.525672490714217e-05, + "loss": 3.0338, + "step": 915500 + }, + { + "epoch": 0.28475193785195696, + "grad_norm": 9.060997009277344, + "learning_rate": 4.5254134369134055e-05, + "loss": 3.0623, + "step": 916000 + }, + { + "epoch": 0.2849073701324438, + "grad_norm": 9.198747634887695, + "learning_rate": 4.5251543831125936e-05, + "loss": 3.0125, + "step": 916500 + }, + { + "epoch": 0.28506280241293075, + "grad_norm": 7.651142120361328, + "learning_rate": 4.524895329311782e-05, + "loss": 3.0532, + "step": 917000 + }, + { + "epoch": 0.2852182346934176, + "grad_norm": 8.47026538848877, + "learning_rate": 4.524636275510971e-05, + "loss": 3.0528, + "step": 917500 + }, + { + "epoch": 0.2853736669739045, + "grad_norm": 8.749290466308594, + "learning_rate": 4.524377221710159e-05, + "loss": 3.1117, + "step": 918000 + }, + { + "epoch": 0.28552909925439135, + "grad_norm": 8.539522171020508, + "learning_rate": 4.524118167909348e-05, + "loss": 3.0501, + "step": 918500 + }, + { + "epoch": 0.2856845315348782, + "grad_norm": 6.681888103485107, + "learning_rate": 4.5238591141085365e-05, + "loss": 3.0654, + "step": 919000 + }, + { + "epoch": 0.2858399638153651, + "grad_norm": 15.017863273620605, + "learning_rate": 4.523600060307725e-05, + "loss": 3.0711, + "step": 919500 + }, + { + "epoch": 0.285995396095852, + "grad_norm": 8.648761749267578, + "learning_rate": 4.523341006506914e-05, + "loss": 3.0251, + "step": 920000 + }, + { + "epoch": 0.2861508283763389, + "grad_norm": 9.540538787841797, + "learning_rate": 4.523081952706102e-05, + "loss": 3.0819, + "step": 920500 + }, + { + "epoch": 0.28630626065682574, + "grad_norm": 7.2777533531188965, + "learning_rate": 4.522822898905291e-05, + "loss": 3.0609, + "step": 921000 + }, + { + "epoch": 0.2864616929373126, + "grad_norm": 7.687942028045654, + "learning_rate": 4.5225638451044794e-05, + "loss": 3.0636, + "step": 921500 + }, + { + "epoch": 0.2866171252177995, + "grad_norm": 7.193161964416504, + "learning_rate": 4.5223047913036674e-05, + "loss": 3.0522, + "step": 922000 + }, + { + "epoch": 0.28677255749828634, + "grad_norm": 6.960118293762207, + "learning_rate": 4.522045737502856e-05, + "loss": 3.0276, + "step": 922500 + }, + { + "epoch": 0.28692798977877326, + "grad_norm": 7.526091575622559, + "learning_rate": 4.521786683702045e-05, + "loss": 3.0255, + "step": 923000 + }, + { + "epoch": 0.28708342205926013, + "grad_norm": 8.078575134277344, + "learning_rate": 4.5215276299012336e-05, + "loss": 3.0875, + "step": 923500 + }, + { + "epoch": 0.287238854339747, + "grad_norm": 12.096606254577637, + "learning_rate": 4.521268576100422e-05, + "loss": 3.0559, + "step": 924000 + }, + { + "epoch": 0.28739428662023386, + "grad_norm": 7.544878959655762, + "learning_rate": 4.52100952229961e-05, + "loss": 3.0986, + "step": 924500 + }, + { + "epoch": 0.28754971890072073, + "grad_norm": 7.719270706176758, + "learning_rate": 4.520750468498799e-05, + "loss": 3.0563, + "step": 925000 + }, + { + "epoch": 0.2877051511812076, + "grad_norm": 8.317971229553223, + "learning_rate": 4.520491414697988e-05, + "loss": 3.0687, + "step": 925500 + }, + { + "epoch": 0.2878605834616945, + "grad_norm": 8.58825969696045, + "learning_rate": 4.520232360897176e-05, + "loss": 3.0521, + "step": 926000 + }, + { + "epoch": 0.2880160157421814, + "grad_norm": 7.996367454528809, + "learning_rate": 4.5199733070963645e-05, + "loss": 3.0349, + "step": 926500 + }, + { + "epoch": 0.28817144802266825, + "grad_norm": 7.939483642578125, + "learning_rate": 4.519714253295553e-05, + "loss": 3.0673, + "step": 927000 + }, + { + "epoch": 0.2883268803031551, + "grad_norm": 16.426708221435547, + "learning_rate": 4.519455199494741e-05, + "loss": 3.0837, + "step": 927500 + }, + { + "epoch": 0.288482312583642, + "grad_norm": 7.622366905212402, + "learning_rate": 4.51919614569393e-05, + "loss": 3.0793, + "step": 928000 + }, + { + "epoch": 0.28863774486412885, + "grad_norm": 15.259408950805664, + "learning_rate": 4.518937091893119e-05, + "loss": 3.094, + "step": 928500 + }, + { + "epoch": 0.2887931771446158, + "grad_norm": 7.1601338386535645, + "learning_rate": 4.5186780380923074e-05, + "loss": 3.02, + "step": 929000 + }, + { + "epoch": 0.28894860942510264, + "grad_norm": 7.088866233825684, + "learning_rate": 4.518418984291496e-05, + "loss": 3.1184, + "step": 929500 + }, + { + "epoch": 0.2891040417055895, + "grad_norm": 8.780591011047363, + "learning_rate": 4.518159930490685e-05, + "loss": 3.0251, + "step": 930000 + }, + { + "epoch": 0.2892594739860764, + "grad_norm": 22.30752944946289, + "learning_rate": 4.517900876689873e-05, + "loss": 3.1012, + "step": 930500 + }, + { + "epoch": 0.28941490626656324, + "grad_norm": 7.707304000854492, + "learning_rate": 4.5176418228890616e-05, + "loss": 3.0686, + "step": 931000 + }, + { + "epoch": 0.2895703385470501, + "grad_norm": 9.870542526245117, + "learning_rate": 4.5173827690882496e-05, + "loss": 3.0557, + "step": 931500 + }, + { + "epoch": 0.289725770827537, + "grad_norm": 7.852762699127197, + "learning_rate": 4.517123715287438e-05, + "loss": 3.0937, + "step": 932000 + }, + { + "epoch": 0.2898812031080239, + "grad_norm": 9.010544776916504, + "learning_rate": 4.516864661486627e-05, + "loss": 3.0857, + "step": 932500 + }, + { + "epoch": 0.29003663538851077, + "grad_norm": 14.694048881530762, + "learning_rate": 4.516605607685815e-05, + "loss": 3.1157, + "step": 933000 + }, + { + "epoch": 0.29019206766899763, + "grad_norm": 10.392428398132324, + "learning_rate": 4.5163465538850045e-05, + "loss": 3.0629, + "step": 933500 + }, + { + "epoch": 0.2903474999494845, + "grad_norm": 7.9094953536987305, + "learning_rate": 4.516087500084193e-05, + "loss": 3.0312, + "step": 934000 + }, + { + "epoch": 0.29050293222997137, + "grad_norm": 13.239105224609375, + "learning_rate": 4.515828446283381e-05, + "loss": 3.0277, + "step": 934500 + }, + { + "epoch": 0.29065836451045823, + "grad_norm": 7.226293087005615, + "learning_rate": 4.51556939248257e-05, + "loss": 3.0454, + "step": 935000 + }, + { + "epoch": 0.29081379679094516, + "grad_norm": 7.6093950271606445, + "learning_rate": 4.515310338681759e-05, + "loss": 3.0405, + "step": 935500 + }, + { + "epoch": 0.290969229071432, + "grad_norm": 32.42957305908203, + "learning_rate": 4.515051284880947e-05, + "loss": 3.1119, + "step": 936000 + }, + { + "epoch": 0.2911246613519189, + "grad_norm": 9.104327201843262, + "learning_rate": 4.5147922310801354e-05, + "loss": 3.0194, + "step": 936500 + }, + { + "epoch": 0.29128009363240576, + "grad_norm": 9.065271377563477, + "learning_rate": 4.5145331772793235e-05, + "loss": 3.0373, + "step": 937000 + }, + { + "epoch": 0.2914355259128926, + "grad_norm": 12.262693405151367, + "learning_rate": 4.514274123478512e-05, + "loss": 3.1115, + "step": 937500 + }, + { + "epoch": 0.2915909581933795, + "grad_norm": 11.424263000488281, + "learning_rate": 4.514015069677701e-05, + "loss": 3.082, + "step": 938000 + }, + { + "epoch": 0.2917463904738664, + "grad_norm": 5.205952167510986, + "learning_rate": 4.5137560158768896e-05, + "loss": 3.0165, + "step": 938500 + }, + { + "epoch": 0.2919018227543533, + "grad_norm": 9.083389282226562, + "learning_rate": 4.513496962076078e-05, + "loss": 3.0745, + "step": 939000 + }, + { + "epoch": 0.29205725503484015, + "grad_norm": 13.72270393371582, + "learning_rate": 4.513237908275267e-05, + "loss": 3.0662, + "step": 939500 + }, + { + "epoch": 0.292212687315327, + "grad_norm": 8.252921104431152, + "learning_rate": 4.512978854474455e-05, + "loss": 3.047, + "step": 940000 + }, + { + "epoch": 0.2923681195958139, + "grad_norm": 7.795941352844238, + "learning_rate": 4.512719800673644e-05, + "loss": 3.0372, + "step": 940500 + }, + { + "epoch": 0.29252355187630075, + "grad_norm": 11.442863464355469, + "learning_rate": 4.5124607468728325e-05, + "loss": 3.0377, + "step": 941000 + }, + { + "epoch": 0.29267898415678767, + "grad_norm": 11.503290176391602, + "learning_rate": 4.5122016930720205e-05, + "loss": 3.0401, + "step": 941500 + }, + { + "epoch": 0.29283441643727454, + "grad_norm": 13.745347023010254, + "learning_rate": 4.511942639271209e-05, + "loss": 3.0586, + "step": 942000 + }, + { + "epoch": 0.2929898487177614, + "grad_norm": 10.687305450439453, + "learning_rate": 4.511683585470398e-05, + "loss": 3.0533, + "step": 942500 + }, + { + "epoch": 0.29314528099824827, + "grad_norm": 9.663128852844238, + "learning_rate": 4.511424531669586e-05, + "loss": 3.0932, + "step": 943000 + }, + { + "epoch": 0.29330071327873514, + "grad_norm": 10.81393051147461, + "learning_rate": 4.5111654778687754e-05, + "loss": 3.0414, + "step": 943500 + }, + { + "epoch": 0.293456145559222, + "grad_norm": 10.118293762207031, + "learning_rate": 4.5109064240679634e-05, + "loss": 3.0376, + "step": 944000 + }, + { + "epoch": 0.2936115778397089, + "grad_norm": 11.57665729522705, + "learning_rate": 4.510647370267152e-05, + "loss": 3.0596, + "step": 944500 + }, + { + "epoch": 0.2937670101201958, + "grad_norm": 8.796860694885254, + "learning_rate": 4.510388316466341e-05, + "loss": 3.0286, + "step": 945000 + }, + { + "epoch": 0.29392244240068266, + "grad_norm": 11.744518280029297, + "learning_rate": 4.510129262665529e-05, + "loss": 3.0293, + "step": 945500 + }, + { + "epoch": 0.2940778746811695, + "grad_norm": 9.780899047851562, + "learning_rate": 4.5098702088647176e-05, + "loss": 3.0375, + "step": 946000 + }, + { + "epoch": 0.2942333069616564, + "grad_norm": 7.7592573165893555, + "learning_rate": 4.509611155063906e-05, + "loss": 3.0845, + "step": 946500 + }, + { + "epoch": 0.29438873924214326, + "grad_norm": 8.0684232711792, + "learning_rate": 4.5093521012630944e-05, + "loss": 3.0521, + "step": 947000 + }, + { + "epoch": 0.2945441715226302, + "grad_norm": 7.912718296051025, + "learning_rate": 4.509093047462283e-05, + "loss": 3.1034, + "step": 947500 + }, + { + "epoch": 0.29469960380311705, + "grad_norm": 8.39204216003418, + "learning_rate": 4.508833993661472e-05, + "loss": 3.0758, + "step": 948000 + }, + { + "epoch": 0.2948550360836039, + "grad_norm": 10.325467109680176, + "learning_rate": 4.5085749398606605e-05, + "loss": 3.0766, + "step": 948500 + }, + { + "epoch": 0.2950104683640908, + "grad_norm": 8.820301055908203, + "learning_rate": 4.508315886059849e-05, + "loss": 3.0692, + "step": 949000 + }, + { + "epoch": 0.29516590064457765, + "grad_norm": 8.23974609375, + "learning_rate": 4.508056832259037e-05, + "loss": 3.0568, + "step": 949500 + }, + { + "epoch": 0.2953213329250645, + "grad_norm": 8.262386322021484, + "learning_rate": 4.507797778458226e-05, + "loss": 3.1034, + "step": 950000 + }, + { + "epoch": 0.29547676520555144, + "grad_norm": 6.818932056427002, + "learning_rate": 4.507538724657415e-05, + "loss": 3.0859, + "step": 950500 + }, + { + "epoch": 0.2956321974860383, + "grad_norm": 6.265594005584717, + "learning_rate": 4.507279670856603e-05, + "loss": 3.0839, + "step": 951000 + }, + { + "epoch": 0.2957876297665252, + "grad_norm": 9.35272216796875, + "learning_rate": 4.5070206170557915e-05, + "loss": 3.0596, + "step": 951500 + }, + { + "epoch": 0.29594306204701204, + "grad_norm": 9.586097717285156, + "learning_rate": 4.50676156325498e-05, + "loss": 3.0704, + "step": 952000 + }, + { + "epoch": 0.2960984943274989, + "grad_norm": 7.723526954650879, + "learning_rate": 4.506502509454168e-05, + "loss": 3.0235, + "step": 952500 + }, + { + "epoch": 0.2962539266079858, + "grad_norm": 10.211925506591797, + "learning_rate": 4.506243455653357e-05, + "loss": 3.0247, + "step": 953000 + }, + { + "epoch": 0.2964093588884727, + "grad_norm": 7.253708362579346, + "learning_rate": 4.505984401852546e-05, + "loss": 3.0712, + "step": 953500 + }, + { + "epoch": 0.29656479116895956, + "grad_norm": 8.878169059753418, + "learning_rate": 4.5057253480517344e-05, + "loss": 2.9993, + "step": 954000 + }, + { + "epoch": 0.29672022344944643, + "grad_norm": 8.073410034179688, + "learning_rate": 4.505466294250923e-05, + "loss": 3.1013, + "step": 954500 + }, + { + "epoch": 0.2968756557299333, + "grad_norm": 7.911851406097412, + "learning_rate": 4.505207240450111e-05, + "loss": 3.0564, + "step": 955000 + }, + { + "epoch": 0.29703108801042016, + "grad_norm": 7.706331729888916, + "learning_rate": 4.5049481866493e-05, + "loss": 3.0441, + "step": 955500 + }, + { + "epoch": 0.29718652029090703, + "grad_norm": 12.064401626586914, + "learning_rate": 4.5046891328484885e-05, + "loss": 3.0809, + "step": 956000 + }, + { + "epoch": 0.29734195257139395, + "grad_norm": 7.766979217529297, + "learning_rate": 4.5044300790476766e-05, + "loss": 3.0702, + "step": 956500 + }, + { + "epoch": 0.2974973848518808, + "grad_norm": 12.716899871826172, + "learning_rate": 4.504171025246865e-05, + "loss": 3.0841, + "step": 957000 + }, + { + "epoch": 0.2976528171323677, + "grad_norm": 14.513644218444824, + "learning_rate": 4.503911971446054e-05, + "loss": 3.1154, + "step": 957500 + }, + { + "epoch": 0.29780824941285455, + "grad_norm": 7.1412177085876465, + "learning_rate": 4.503652917645243e-05, + "loss": 3.0286, + "step": 958000 + }, + { + "epoch": 0.2979636816933414, + "grad_norm": 8.508959770202637, + "learning_rate": 4.5033938638444314e-05, + "loss": 3.0325, + "step": 958500 + }, + { + "epoch": 0.2981191139738283, + "grad_norm": 8.644041061401367, + "learning_rate": 4.50313481004362e-05, + "loss": 3.0477, + "step": 959000 + }, + { + "epoch": 0.2982745462543152, + "grad_norm": 7.87005090713501, + "learning_rate": 4.502875756242808e-05, + "loss": 3.0422, + "step": 959500 + }, + { + "epoch": 0.2984299785348021, + "grad_norm": 7.756917476654053, + "learning_rate": 4.502616702441997e-05, + "loss": 3.0028, + "step": 960000 + }, + { + "epoch": 0.29858541081528894, + "grad_norm": 7.469654560089111, + "learning_rate": 4.5023576486411856e-05, + "loss": 3.0946, + "step": 960500 + }, + { + "epoch": 0.2987408430957758, + "grad_norm": 5.917862415313721, + "learning_rate": 4.5020985948403737e-05, + "loss": 3.0625, + "step": 961000 + }, + { + "epoch": 0.2988962753762627, + "grad_norm": 6.552438259124756, + "learning_rate": 4.5018395410395624e-05, + "loss": 3.0686, + "step": 961500 + }, + { + "epoch": 0.29905170765674954, + "grad_norm": 6.348852634429932, + "learning_rate": 4.5015804872387504e-05, + "loss": 3.0704, + "step": 962000 + }, + { + "epoch": 0.29920713993723647, + "grad_norm": 7.846667766571045, + "learning_rate": 4.501321433437939e-05, + "loss": 3.0852, + "step": 962500 + }, + { + "epoch": 0.29936257221772333, + "grad_norm": 7.648201942443848, + "learning_rate": 4.501062379637128e-05, + "loss": 3.072, + "step": 963000 + }, + { + "epoch": 0.2995180044982102, + "grad_norm": 14.132330894470215, + "learning_rate": 4.5008033258363166e-05, + "loss": 3.0944, + "step": 963500 + }, + { + "epoch": 0.29967343677869707, + "grad_norm": 9.43336009979248, + "learning_rate": 4.500544272035505e-05, + "loss": 3.0715, + "step": 964000 + }, + { + "epoch": 0.29982886905918393, + "grad_norm": 7.927108287811279, + "learning_rate": 4.500285218234694e-05, + "loss": 3.0396, + "step": 964500 + }, + { + "epoch": 0.2999843013396708, + "grad_norm": 8.218104362487793, + "learning_rate": 4.500026164433882e-05, + "loss": 3.0294, + "step": 965000 + }, + { + "epoch": 0.3001397336201577, + "grad_norm": 8.968445777893066, + "learning_rate": 4.499767110633071e-05, + "loss": 3.0726, + "step": 965500 + }, + { + "epoch": 0.3002951659006446, + "grad_norm": 9.64793872833252, + "learning_rate": 4.4995080568322595e-05, + "loss": 3.0631, + "step": 966000 + }, + { + "epoch": 0.30045059818113146, + "grad_norm": 8.018625259399414, + "learning_rate": 4.4992490030314475e-05, + "loss": 3.0799, + "step": 966500 + }, + { + "epoch": 0.3006060304616183, + "grad_norm": 27.074661254882812, + "learning_rate": 4.498989949230636e-05, + "loss": 3.0864, + "step": 967000 + }, + { + "epoch": 0.3007614627421052, + "grad_norm": 15.78603744506836, + "learning_rate": 4.498730895429825e-05, + "loss": 3.0648, + "step": 967500 + }, + { + "epoch": 0.30091689502259206, + "grad_norm": 8.16176700592041, + "learning_rate": 4.4984718416290136e-05, + "loss": 3.1048, + "step": 968000 + }, + { + "epoch": 0.301072327303079, + "grad_norm": 8.061563491821289, + "learning_rate": 4.4982127878282024e-05, + "loss": 3.0117, + "step": 968500 + }, + { + "epoch": 0.30122775958356585, + "grad_norm": 7.857732772827148, + "learning_rate": 4.4979537340273904e-05, + "loss": 3.038, + "step": 969000 + }, + { + "epoch": 0.3013831918640527, + "grad_norm": 7.400825500488281, + "learning_rate": 4.497694680226579e-05, + "loss": 3.0692, + "step": 969500 + }, + { + "epoch": 0.3015386241445396, + "grad_norm": 10.429243087768555, + "learning_rate": 4.497435626425768e-05, + "loss": 3.1006, + "step": 970000 + }, + { + "epoch": 0.30169405642502645, + "grad_norm": 7.526261329650879, + "learning_rate": 4.497176572624956e-05, + "loss": 3.1011, + "step": 970500 + }, + { + "epoch": 0.3018494887055133, + "grad_norm": 8.1680908203125, + "learning_rate": 4.4969175188241446e-05, + "loss": 3.0866, + "step": 971000 + }, + { + "epoch": 0.30200492098600024, + "grad_norm": 7.015842437744141, + "learning_rate": 4.496658465023333e-05, + "loss": 3.0491, + "step": 971500 + }, + { + "epoch": 0.3021603532664871, + "grad_norm": 7.782586574554443, + "learning_rate": 4.496399411222521e-05, + "loss": 3.091, + "step": 972000 + }, + { + "epoch": 0.30231578554697397, + "grad_norm": 9.652567863464355, + "learning_rate": 4.49614035742171e-05, + "loss": 3.0341, + "step": 972500 + }, + { + "epoch": 0.30247121782746084, + "grad_norm": 7.59584379196167, + "learning_rate": 4.495881303620899e-05, + "loss": 3.0453, + "step": 973000 + }, + { + "epoch": 0.3026266501079477, + "grad_norm": 8.352418899536133, + "learning_rate": 4.4956222498200875e-05, + "loss": 3.0363, + "step": 973500 + }, + { + "epoch": 0.30278208238843457, + "grad_norm": 7.583561420440674, + "learning_rate": 4.495363196019276e-05, + "loss": 3.0784, + "step": 974000 + }, + { + "epoch": 0.3029375146689215, + "grad_norm": 6.690750598907471, + "learning_rate": 4.495104142218464e-05, + "loss": 3.0355, + "step": 974500 + }, + { + "epoch": 0.30309294694940836, + "grad_norm": 12.785087585449219, + "learning_rate": 4.494845088417653e-05, + "loss": 3.0667, + "step": 975000 + }, + { + "epoch": 0.3032483792298952, + "grad_norm": 9.672938346862793, + "learning_rate": 4.4945860346168417e-05, + "loss": 3.0495, + "step": 975500 + }, + { + "epoch": 0.3034038115103821, + "grad_norm": 9.910211563110352, + "learning_rate": 4.49432698081603e-05, + "loss": 3.05, + "step": 976000 + }, + { + "epoch": 0.30355924379086896, + "grad_norm": 9.72908878326416, + "learning_rate": 4.4940679270152184e-05, + "loss": 3.065, + "step": 976500 + }, + { + "epoch": 0.3037146760713558, + "grad_norm": 6.8822174072265625, + "learning_rate": 4.493808873214407e-05, + "loss": 3.0284, + "step": 977000 + }, + { + "epoch": 0.30387010835184275, + "grad_norm": 7.982511520385742, + "learning_rate": 4.493549819413596e-05, + "loss": 3.0707, + "step": 977500 + }, + { + "epoch": 0.3040255406323296, + "grad_norm": 8.009747505187988, + "learning_rate": 4.4932907656127846e-05, + "loss": 3.0499, + "step": 978000 + }, + { + "epoch": 0.3041809729128165, + "grad_norm": 56.437171936035156, + "learning_rate": 4.493031711811973e-05, + "loss": 3.0519, + "step": 978500 + }, + { + "epoch": 0.30433640519330335, + "grad_norm": 6.742048740386963, + "learning_rate": 4.492772658011161e-05, + "loss": 3.0544, + "step": 979000 + }, + { + "epoch": 0.3044918374737902, + "grad_norm": 9.653396606445312, + "learning_rate": 4.49251360421035e-05, + "loss": 3.0198, + "step": 979500 + }, + { + "epoch": 0.3046472697542771, + "grad_norm": 6.676183223724365, + "learning_rate": 4.492254550409538e-05, + "loss": 3.0588, + "step": 980000 + }, + { + "epoch": 0.304802702034764, + "grad_norm": 10.533844947814941, + "learning_rate": 4.491995496608727e-05, + "loss": 3.0076, + "step": 980500 + }, + { + "epoch": 0.3049581343152509, + "grad_norm": 8.908109664916992, + "learning_rate": 4.4917364428079155e-05, + "loss": 3.0561, + "step": 981000 + }, + { + "epoch": 0.30511356659573774, + "grad_norm": 9.523250579833984, + "learning_rate": 4.4914773890071035e-05, + "loss": 3.0438, + "step": 981500 + }, + { + "epoch": 0.3052689988762246, + "grad_norm": 8.367324829101562, + "learning_rate": 4.491218335206292e-05, + "loss": 3.0385, + "step": 982000 + }, + { + "epoch": 0.3054244311567115, + "grad_norm": 8.721476554870605, + "learning_rate": 4.490959281405481e-05, + "loss": 3.041, + "step": 982500 + }, + { + "epoch": 0.30557986343719834, + "grad_norm": 8.01758861541748, + "learning_rate": 4.49070022760467e-05, + "loss": 3.0399, + "step": 983000 + }, + { + "epoch": 0.30573529571768526, + "grad_norm": 7.3933587074279785, + "learning_rate": 4.4904411738038584e-05, + "loss": 3.0592, + "step": 983500 + }, + { + "epoch": 0.30589072799817213, + "grad_norm": 11.307307243347168, + "learning_rate": 4.490182120003047e-05, + "loss": 3.0433, + "step": 984000 + }, + { + "epoch": 0.306046160278659, + "grad_norm": 6.47851037979126, + "learning_rate": 4.489923066202235e-05, + "loss": 3.0706, + "step": 984500 + }, + { + "epoch": 0.30620159255914586, + "grad_norm": 8.475309371948242, + "learning_rate": 4.489664012401424e-05, + "loss": 3.0581, + "step": 985000 + }, + { + "epoch": 0.30635702483963273, + "grad_norm": 7.058174133300781, + "learning_rate": 4.489404958600612e-05, + "loss": 3.0579, + "step": 985500 + }, + { + "epoch": 0.3065124571201196, + "grad_norm": 11.761174201965332, + "learning_rate": 4.4891459047998006e-05, + "loss": 3.0779, + "step": 986000 + }, + { + "epoch": 0.3066678894006065, + "grad_norm": 8.137330055236816, + "learning_rate": 4.488886850998989e-05, + "loss": 3.0133, + "step": 986500 + }, + { + "epoch": 0.3068233216810934, + "grad_norm": 7.8293561935424805, + "learning_rate": 4.488627797198178e-05, + "loss": 3.0734, + "step": 987000 + }, + { + "epoch": 0.30697875396158025, + "grad_norm": 9.007035255432129, + "learning_rate": 4.488368743397367e-05, + "loss": 3.0263, + "step": 987500 + }, + { + "epoch": 0.3071341862420671, + "grad_norm": 13.308154106140137, + "learning_rate": 4.4881096895965555e-05, + "loss": 3.044, + "step": 988000 + }, + { + "epoch": 0.307289618522554, + "grad_norm": 7.411919116973877, + "learning_rate": 4.4878506357957435e-05, + "loss": 3.0486, + "step": 988500 + }, + { + "epoch": 0.30744505080304085, + "grad_norm": 8.173084259033203, + "learning_rate": 4.487591581994932e-05, + "loss": 3.0486, + "step": 989000 + }, + { + "epoch": 0.3076004830835278, + "grad_norm": 10.89287281036377, + "learning_rate": 4.487332528194121e-05, + "loss": 3.0267, + "step": 989500 + }, + { + "epoch": 0.30775591536401464, + "grad_norm": 8.322940826416016, + "learning_rate": 4.487073474393309e-05, + "loss": 3.0666, + "step": 990000 + }, + { + "epoch": 0.3079113476445015, + "grad_norm": 21.962650299072266, + "learning_rate": 4.486814420592498e-05, + "loss": 3.0409, + "step": 990500 + }, + { + "epoch": 0.3080667799249884, + "grad_norm": 7.698042392730713, + "learning_rate": 4.486555366791686e-05, + "loss": 3.0082, + "step": 991000 + }, + { + "epoch": 0.30822221220547524, + "grad_norm": 7.348971843719482, + "learning_rate": 4.4862963129908745e-05, + "loss": 3.0625, + "step": 991500 + }, + { + "epoch": 0.3083776444859621, + "grad_norm": 7.37475061416626, + "learning_rate": 4.486037259190063e-05, + "loss": 3.0672, + "step": 992000 + }, + { + "epoch": 0.30853307676644903, + "grad_norm": 6.9253315925598145, + "learning_rate": 4.485778205389252e-05, + "loss": 3.0679, + "step": 992500 + }, + { + "epoch": 0.3086885090469359, + "grad_norm": 10.303685188293457, + "learning_rate": 4.4855191515884406e-05, + "loss": 3.0644, + "step": 993000 + }, + { + "epoch": 0.30884394132742277, + "grad_norm": 8.291455268859863, + "learning_rate": 4.485260097787629e-05, + "loss": 3.0641, + "step": 993500 + }, + { + "epoch": 0.30899937360790963, + "grad_norm": 6.720599174499512, + "learning_rate": 4.4850010439868174e-05, + "loss": 3.0563, + "step": 994000 + }, + { + "epoch": 0.3091548058883965, + "grad_norm": 8.459346771240234, + "learning_rate": 4.484741990186006e-05, + "loss": 3.0677, + "step": 994500 + }, + { + "epoch": 0.30931023816888337, + "grad_norm": 8.289113998413086, + "learning_rate": 4.484482936385195e-05, + "loss": 3.058, + "step": 995000 + }, + { + "epoch": 0.3094656704493703, + "grad_norm": 7.521881103515625, + "learning_rate": 4.484223882584383e-05, + "loss": 3.0229, + "step": 995500 + }, + { + "epoch": 0.30962110272985716, + "grad_norm": 7.0714850425720215, + "learning_rate": 4.4839648287835715e-05, + "loss": 3.0329, + "step": 996000 + }, + { + "epoch": 0.309776535010344, + "grad_norm": 10.034725189208984, + "learning_rate": 4.48370577498276e-05, + "loss": 3.0302, + "step": 996500 + }, + { + "epoch": 0.3099319672908309, + "grad_norm": 8.962692260742188, + "learning_rate": 4.483446721181949e-05, + "loss": 3.0321, + "step": 997000 + }, + { + "epoch": 0.31008739957131776, + "grad_norm": 10.185530662536621, + "learning_rate": 4.483187667381138e-05, + "loss": 3.0761, + "step": 997500 + }, + { + "epoch": 0.3102428318518046, + "grad_norm": 7.457888126373291, + "learning_rate": 4.482928613580326e-05, + "loss": 3.0088, + "step": 998000 + }, + { + "epoch": 0.31039826413229155, + "grad_norm": 12.90689468383789, + "learning_rate": 4.4826695597795144e-05, + "loss": 3.0695, + "step": 998500 + }, + { + "epoch": 0.3105536964127784, + "grad_norm": 10.142727851867676, + "learning_rate": 4.482410505978703e-05, + "loss": 3.0695, + "step": 999000 + }, + { + "epoch": 0.3107091286932653, + "grad_norm": 8.061278343200684, + "learning_rate": 4.482151452177891e-05, + "loss": 3.0518, + "step": 999500 + }, + { + "epoch": 0.31086456097375215, + "grad_norm": 10.38846492767334, + "learning_rate": 4.48189239837708e-05, + "loss": 3.0745, + "step": 1000000 + }, + { + "epoch": 0.311019993254239, + "grad_norm": 6.842511177062988, + "learning_rate": 4.4816333445762686e-05, + "loss": 3.0331, + "step": 1000500 + }, + { + "epoch": 0.3111754255347259, + "grad_norm": 8.766606330871582, + "learning_rate": 4.4813742907754567e-05, + "loss": 3.0514, + "step": 1001000 + }, + { + "epoch": 0.3113308578152128, + "grad_norm": 7.9612345695495605, + "learning_rate": 4.4811152369746454e-05, + "loss": 3.0231, + "step": 1001500 + }, + { + "epoch": 0.31148629009569967, + "grad_norm": 18.88245391845703, + "learning_rate": 4.480856183173834e-05, + "loss": 3.0892, + "step": 1002000 + }, + { + "epoch": 0.31164172237618654, + "grad_norm": 8.376673698425293, + "learning_rate": 4.480597129373023e-05, + "loss": 3.0148, + "step": 1002500 + }, + { + "epoch": 0.3117971546566734, + "grad_norm": 7.3854522705078125, + "learning_rate": 4.4803380755722115e-05, + "loss": 3.0722, + "step": 1003000 + }, + { + "epoch": 0.31195258693716027, + "grad_norm": 7.921773910522461, + "learning_rate": 4.4800790217713996e-05, + "loss": 3.0773, + "step": 1003500 + }, + { + "epoch": 0.31210801921764714, + "grad_norm": 8.099723815917969, + "learning_rate": 4.479819967970588e-05, + "loss": 3.0581, + "step": 1004000 + }, + { + "epoch": 0.31226345149813406, + "grad_norm": 7.757390022277832, + "learning_rate": 4.479560914169777e-05, + "loss": 3.0641, + "step": 1004500 + }, + { + "epoch": 0.3124188837786209, + "grad_norm": 15.17562198638916, + "learning_rate": 4.479301860368965e-05, + "loss": 3.0477, + "step": 1005000 + }, + { + "epoch": 0.3125743160591078, + "grad_norm": 8.026070594787598, + "learning_rate": 4.479042806568154e-05, + "loss": 3.0092, + "step": 1005500 + }, + { + "epoch": 0.31272974833959466, + "grad_norm": 7.295668125152588, + "learning_rate": 4.4787837527673425e-05, + "loss": 3.0754, + "step": 1006000 + }, + { + "epoch": 0.3128851806200815, + "grad_norm": 9.316947937011719, + "learning_rate": 4.4785246989665305e-05, + "loss": 3.045, + "step": 1006500 + }, + { + "epoch": 0.3130406129005684, + "grad_norm": 7.639510631561279, + "learning_rate": 4.47826564516572e-05, + "loss": 3.0965, + "step": 1007000 + }, + { + "epoch": 0.3131960451810553, + "grad_norm": 8.758833885192871, + "learning_rate": 4.4780065913649086e-05, + "loss": 3.0483, + "step": 1007500 + }, + { + "epoch": 0.3133514774615422, + "grad_norm": 8.163594245910645, + "learning_rate": 4.4777475375640966e-05, + "loss": 3.0204, + "step": 1008000 + }, + { + "epoch": 0.31350690974202905, + "grad_norm": 22.8593807220459, + "learning_rate": 4.4774884837632854e-05, + "loss": 3.0458, + "step": 1008500 + }, + { + "epoch": 0.3136623420225159, + "grad_norm": 9.122150421142578, + "learning_rate": 4.4772294299624734e-05, + "loss": 3.0679, + "step": 1009000 + }, + { + "epoch": 0.3138177743030028, + "grad_norm": 19.60992431640625, + "learning_rate": 4.476970376161662e-05, + "loss": 3.077, + "step": 1009500 + }, + { + "epoch": 0.31397320658348965, + "grad_norm": 9.247125625610352, + "learning_rate": 4.476711322360851e-05, + "loss": 3.0335, + "step": 1010000 + }, + { + "epoch": 0.3141286388639766, + "grad_norm": 10.985102653503418, + "learning_rate": 4.476452268560039e-05, + "loss": 3.0556, + "step": 1010500 + }, + { + "epoch": 0.31428407114446344, + "grad_norm": 7.854257106781006, + "learning_rate": 4.4761932147592276e-05, + "loss": 3.0573, + "step": 1011000 + }, + { + "epoch": 0.3144395034249503, + "grad_norm": 8.039657592773438, + "learning_rate": 4.475934160958416e-05, + "loss": 3.0462, + "step": 1011500 + }, + { + "epoch": 0.3145949357054372, + "grad_norm": 5.060207366943359, + "learning_rate": 4.475675107157605e-05, + "loss": 3.0442, + "step": 1012000 + }, + { + "epoch": 0.31475036798592404, + "grad_norm": 9.297856330871582, + "learning_rate": 4.475416053356794e-05, + "loss": 3.0405, + "step": 1012500 + }, + { + "epoch": 0.3149058002664109, + "grad_norm": 6.733607769012451, + "learning_rate": 4.4751569995559824e-05, + "loss": 3.0161, + "step": 1013000 + }, + { + "epoch": 0.31506123254689783, + "grad_norm": 8.375140190124512, + "learning_rate": 4.4748979457551705e-05, + "loss": 3.0266, + "step": 1013500 + }, + { + "epoch": 0.3152166648273847, + "grad_norm": 11.017557144165039, + "learning_rate": 4.474638891954359e-05, + "loss": 3.0973, + "step": 1014000 + }, + { + "epoch": 0.31537209710787156, + "grad_norm": 11.160408973693848, + "learning_rate": 4.474379838153548e-05, + "loss": 3.0162, + "step": 1014500 + }, + { + "epoch": 0.31552752938835843, + "grad_norm": 7.715372562408447, + "learning_rate": 4.474120784352736e-05, + "loss": 3.0118, + "step": 1015000 + }, + { + "epoch": 0.3156829616688453, + "grad_norm": 19.515581130981445, + "learning_rate": 4.4738617305519247e-05, + "loss": 3.0425, + "step": 1015500 + }, + { + "epoch": 0.31583839394933216, + "grad_norm": 7.275699615478516, + "learning_rate": 4.473602676751113e-05, + "loss": 3.0788, + "step": 1016000 + }, + { + "epoch": 0.3159938262298191, + "grad_norm": 11.1048002243042, + "learning_rate": 4.4733436229503014e-05, + "loss": 3.0501, + "step": 1016500 + }, + { + "epoch": 0.31614925851030595, + "grad_norm": 7.670830726623535, + "learning_rate": 4.473084569149491e-05, + "loss": 3.0805, + "step": 1017000 + }, + { + "epoch": 0.3163046907907928, + "grad_norm": 31.45163917541504, + "learning_rate": 4.472825515348679e-05, + "loss": 3.0844, + "step": 1017500 + }, + { + "epoch": 0.3164601230712797, + "grad_norm": 8.460518836975098, + "learning_rate": 4.4725664615478676e-05, + "loss": 3.065, + "step": 1018000 + }, + { + "epoch": 0.31661555535176655, + "grad_norm": 8.765888214111328, + "learning_rate": 4.472307407747056e-05, + "loss": 3.0798, + "step": 1018500 + }, + { + "epoch": 0.3167709876322534, + "grad_norm": 8.743024826049805, + "learning_rate": 4.472048353946244e-05, + "loss": 3.0055, + "step": 1019000 + }, + { + "epoch": 0.31692641991274034, + "grad_norm": 6.628053188323975, + "learning_rate": 4.471789300145433e-05, + "loss": 3.0074, + "step": 1019500 + }, + { + "epoch": 0.3170818521932272, + "grad_norm": 8.215058326721191, + "learning_rate": 4.471530246344622e-05, + "loss": 3.0327, + "step": 1020000 + }, + { + "epoch": 0.3172372844737141, + "grad_norm": 11.649791717529297, + "learning_rate": 4.47127119254381e-05, + "loss": 3.0415, + "step": 1020500 + }, + { + "epoch": 0.31739271675420094, + "grad_norm": 8.439931869506836, + "learning_rate": 4.4710121387429985e-05, + "loss": 3.0724, + "step": 1021000 + }, + { + "epoch": 0.3175481490346878, + "grad_norm": 20.469236373901367, + "learning_rate": 4.470753084942187e-05, + "loss": 3.0192, + "step": 1021500 + }, + { + "epoch": 0.3177035813151747, + "grad_norm": 9.961064338684082, + "learning_rate": 4.470494031141376e-05, + "loss": 3.0896, + "step": 1022000 + }, + { + "epoch": 0.3178590135956616, + "grad_norm": 7.95452880859375, + "learning_rate": 4.4702349773405646e-05, + "loss": 3.0498, + "step": 1022500 + }, + { + "epoch": 0.31801444587614847, + "grad_norm": 8.619662284851074, + "learning_rate": 4.469975923539753e-05, + "loss": 3.0422, + "step": 1023000 + }, + { + "epoch": 0.31816987815663533, + "grad_norm": 12.30868148803711, + "learning_rate": 4.4697168697389414e-05, + "loss": 3.0212, + "step": 1023500 + }, + { + "epoch": 0.3183253104371222, + "grad_norm": 68.55529022216797, + "learning_rate": 4.46945781593813e-05, + "loss": 3.0616, + "step": 1024000 + }, + { + "epoch": 0.31848074271760907, + "grad_norm": 8.68040943145752, + "learning_rate": 4.469198762137318e-05, + "loss": 3.0814, + "step": 1024500 + }, + { + "epoch": 0.31863617499809593, + "grad_norm": 9.534733772277832, + "learning_rate": 4.468939708336507e-05, + "loss": 3.0231, + "step": 1025000 + }, + { + "epoch": 0.31879160727858286, + "grad_norm": 8.124999046325684, + "learning_rate": 4.4686806545356956e-05, + "loss": 3.0515, + "step": 1025500 + }, + { + "epoch": 0.3189470395590697, + "grad_norm": 9.341633796691895, + "learning_rate": 4.4684216007348836e-05, + "loss": 3.0354, + "step": 1026000 + }, + { + "epoch": 0.3191024718395566, + "grad_norm": 25.820777893066406, + "learning_rate": 4.468162546934072e-05, + "loss": 2.9962, + "step": 1026500 + }, + { + "epoch": 0.31925790412004346, + "grad_norm": 8.840651512145996, + "learning_rate": 4.467903493133261e-05, + "loss": 3.0274, + "step": 1027000 + }, + { + "epoch": 0.3194133364005303, + "grad_norm": 5.622989177703857, + "learning_rate": 4.46764443933245e-05, + "loss": 3.031, + "step": 1027500 + }, + { + "epoch": 0.3195687686810172, + "grad_norm": 6.743470191955566, + "learning_rate": 4.4673853855316385e-05, + "loss": 3.0573, + "step": 1028000 + }, + { + "epoch": 0.3197242009615041, + "grad_norm": 7.9302849769592285, + "learning_rate": 4.4671263317308265e-05, + "loss": 3.0805, + "step": 1028500 + }, + { + "epoch": 0.319879633241991, + "grad_norm": 6.901163101196289, + "learning_rate": 4.466867277930015e-05, + "loss": 3.1055, + "step": 1029000 + }, + { + "epoch": 0.32003506552247785, + "grad_norm": 9.522652626037598, + "learning_rate": 4.466608224129204e-05, + "loss": 3.0929, + "step": 1029500 + }, + { + "epoch": 0.3201904978029647, + "grad_norm": 15.94373607635498, + "learning_rate": 4.466349170328392e-05, + "loss": 3.0088, + "step": 1030000 + }, + { + "epoch": 0.3203459300834516, + "grad_norm": 16.212373733520508, + "learning_rate": 4.466090116527581e-05, + "loss": 3.0434, + "step": 1030500 + }, + { + "epoch": 0.32050136236393845, + "grad_norm": 8.548895835876465, + "learning_rate": 4.4658310627267694e-05, + "loss": 3.0007, + "step": 1031000 + }, + { + "epoch": 0.32065679464442537, + "grad_norm": 13.586828231811523, + "learning_rate": 4.465572008925958e-05, + "loss": 3.0759, + "step": 1031500 + }, + { + "epoch": 0.32081222692491224, + "grad_norm": 10.72857666015625, + "learning_rate": 4.465312955125147e-05, + "loss": 3.0318, + "step": 1032000 + }, + { + "epoch": 0.3209676592053991, + "grad_norm": 8.943646430969238, + "learning_rate": 4.4650539013243356e-05, + "loss": 3.0647, + "step": 1032500 + }, + { + "epoch": 0.32112309148588597, + "grad_norm": 19.235597610473633, + "learning_rate": 4.4647948475235236e-05, + "loss": 3.038, + "step": 1033000 + }, + { + "epoch": 0.32127852376637284, + "grad_norm": 7.146568298339844, + "learning_rate": 4.464535793722712e-05, + "loss": 3.0516, + "step": 1033500 + }, + { + "epoch": 0.3214339560468597, + "grad_norm": 12.088504791259766, + "learning_rate": 4.4642767399219003e-05, + "loss": 3.0701, + "step": 1034000 + }, + { + "epoch": 0.3215893883273466, + "grad_norm": 9.91869068145752, + "learning_rate": 4.464017686121089e-05, + "loss": 3.0559, + "step": 1034500 + }, + { + "epoch": 0.3217448206078335, + "grad_norm": 8.01838493347168, + "learning_rate": 4.463758632320278e-05, + "loss": 3.0496, + "step": 1035000 + }, + { + "epoch": 0.32190025288832036, + "grad_norm": 12.032242774963379, + "learning_rate": 4.463499578519466e-05, + "loss": 3.0537, + "step": 1035500 + }, + { + "epoch": 0.3220556851688072, + "grad_norm": 6.526250839233398, + "learning_rate": 4.4632405247186545e-05, + "loss": 3.0482, + "step": 1036000 + }, + { + "epoch": 0.3222111174492941, + "grad_norm": 6.626695156097412, + "learning_rate": 4.462981470917843e-05, + "loss": 3.0458, + "step": 1036500 + }, + { + "epoch": 0.32236654972978096, + "grad_norm": 7.453368663787842, + "learning_rate": 4.462722417117032e-05, + "loss": 3.0777, + "step": 1037000 + }, + { + "epoch": 0.3225219820102679, + "grad_norm": 9.009892463684082, + "learning_rate": 4.462463363316221e-05, + "loss": 3.0744, + "step": 1037500 + }, + { + "epoch": 0.32267741429075475, + "grad_norm": 7.147331237792969, + "learning_rate": 4.4622043095154094e-05, + "loss": 3.0212, + "step": 1038000 + }, + { + "epoch": 0.3228328465712416, + "grad_norm": 7.4974517822265625, + "learning_rate": 4.4619452557145974e-05, + "loss": 3.0251, + "step": 1038500 + }, + { + "epoch": 0.3229882788517285, + "grad_norm": 12.08255386352539, + "learning_rate": 4.461686201913786e-05, + "loss": 3.0574, + "step": 1039000 + }, + { + "epoch": 0.32314371113221535, + "grad_norm": 6.131585121154785, + "learning_rate": 4.461427148112974e-05, + "loss": 3.0393, + "step": 1039500 + }, + { + "epoch": 0.3232991434127022, + "grad_norm": 11.312551498413086, + "learning_rate": 4.461168094312163e-05, + "loss": 3.0299, + "step": 1040000 + }, + { + "epoch": 0.32345457569318914, + "grad_norm": 8.498376846313477, + "learning_rate": 4.4609090405113516e-05, + "loss": 3.0006, + "step": 1040500 + }, + { + "epoch": 0.323610007973676, + "grad_norm": 7.9380035400390625, + "learning_rate": 4.46064998671054e-05, + "loss": 3.0657, + "step": 1041000 + }, + { + "epoch": 0.3237654402541629, + "grad_norm": 8.371383666992188, + "learning_rate": 4.460390932909729e-05, + "loss": 3.046, + "step": 1041500 + }, + { + "epoch": 0.32392087253464974, + "grad_norm": 8.080942153930664, + "learning_rate": 4.460131879108918e-05, + "loss": 2.9984, + "step": 1042000 + }, + { + "epoch": 0.3240763048151366, + "grad_norm": 8.177578926086426, + "learning_rate": 4.459872825308106e-05, + "loss": 3.0338, + "step": 1042500 + }, + { + "epoch": 0.3242317370956235, + "grad_norm": 12.204699516296387, + "learning_rate": 4.4596137715072945e-05, + "loss": 3.0484, + "step": 1043000 + }, + { + "epoch": 0.3243871693761104, + "grad_norm": 9.1721773147583, + "learning_rate": 4.459354717706483e-05, + "loss": 3.0933, + "step": 1043500 + }, + { + "epoch": 0.32454260165659726, + "grad_norm": 8.283403396606445, + "learning_rate": 4.459095663905671e-05, + "loss": 3.0442, + "step": 1044000 + }, + { + "epoch": 0.32469803393708413, + "grad_norm": 8.833453178405762, + "learning_rate": 4.45883661010486e-05, + "loss": 3.0196, + "step": 1044500 + }, + { + "epoch": 0.324853466217571, + "grad_norm": 9.044563293457031, + "learning_rate": 4.458577556304048e-05, + "loss": 3.0787, + "step": 1045000 + }, + { + "epoch": 0.32500889849805786, + "grad_norm": 12.06692886352539, + "learning_rate": 4.458318502503237e-05, + "loss": 3.0648, + "step": 1045500 + }, + { + "epoch": 0.32516433077854473, + "grad_norm": 6.455106258392334, + "learning_rate": 4.4580594487024255e-05, + "loss": 3.0428, + "step": 1046000 + }, + { + "epoch": 0.32531976305903165, + "grad_norm": 9.592110633850098, + "learning_rate": 4.457800394901614e-05, + "loss": 3.0045, + "step": 1046500 + }, + { + "epoch": 0.3254751953395185, + "grad_norm": 7.011752605438232, + "learning_rate": 4.457541341100803e-05, + "loss": 3.0636, + "step": 1047000 + }, + { + "epoch": 0.3256306276200054, + "grad_norm": 8.115296363830566, + "learning_rate": 4.4572822872999916e-05, + "loss": 3.024, + "step": 1047500 + }, + { + "epoch": 0.32578605990049225, + "grad_norm": 5.6114983558654785, + "learning_rate": 4.4570232334991796e-05, + "loss": 3.0622, + "step": 1048000 + }, + { + "epoch": 0.3259414921809791, + "grad_norm": 7.301242351531982, + "learning_rate": 4.4567641796983683e-05, + "loss": 3.0262, + "step": 1048500 + }, + { + "epoch": 0.326096924461466, + "grad_norm": 6.5974884033203125, + "learning_rate": 4.456505125897557e-05, + "loss": 3.0463, + "step": 1049000 + }, + { + "epoch": 0.3262523567419529, + "grad_norm": 11.354157447814941, + "learning_rate": 4.456246072096745e-05, + "loss": 3.0433, + "step": 1049500 + }, + { + "epoch": 0.3264077890224398, + "grad_norm": 8.583755493164062, + "learning_rate": 4.455987018295934e-05, + "loss": 3.0504, + "step": 1050000 + }, + { + "epoch": 0.32656322130292664, + "grad_norm": 8.684221267700195, + "learning_rate": 4.4557279644951225e-05, + "loss": 3.0633, + "step": 1050500 + }, + { + "epoch": 0.3267186535834135, + "grad_norm": 7.55294132232666, + "learning_rate": 4.455468910694311e-05, + "loss": 3.0078, + "step": 1051000 + }, + { + "epoch": 0.3268740858639004, + "grad_norm": 9.10940933227539, + "learning_rate": 4.4552098568935e-05, + "loss": 3.04, + "step": 1051500 + }, + { + "epoch": 0.32702951814438724, + "grad_norm": 8.631683349609375, + "learning_rate": 4.454950803092688e-05, + "loss": 3.0289, + "step": 1052000 + }, + { + "epoch": 0.32718495042487417, + "grad_norm": 10.214914321899414, + "learning_rate": 4.454691749291877e-05, + "loss": 3.0204, + "step": 1052500 + }, + { + "epoch": 0.32734038270536103, + "grad_norm": 15.380921363830566, + "learning_rate": 4.4544326954910654e-05, + "loss": 3.0234, + "step": 1053000 + }, + { + "epoch": 0.3274958149858479, + "grad_norm": 9.439560890197754, + "learning_rate": 4.4541736416902535e-05, + "loss": 3.0046, + "step": 1053500 + }, + { + "epoch": 0.32765124726633477, + "grad_norm": 7.541168212890625, + "learning_rate": 4.453914587889442e-05, + "loss": 3.0317, + "step": 1054000 + }, + { + "epoch": 0.32780667954682163, + "grad_norm": 20.98183822631836, + "learning_rate": 4.453655534088631e-05, + "loss": 3.0457, + "step": 1054500 + }, + { + "epoch": 0.3279621118273085, + "grad_norm": 8.685282707214355, + "learning_rate": 4.453396480287819e-05, + "loss": 3.0803, + "step": 1055000 + }, + { + "epoch": 0.3281175441077954, + "grad_norm": 8.747264862060547, + "learning_rate": 4.4531374264870077e-05, + "loss": 3.0605, + "step": 1055500 + }, + { + "epoch": 0.3282729763882823, + "grad_norm": 17.56719398498535, + "learning_rate": 4.4528783726861964e-05, + "loss": 3.0563, + "step": 1056000 + }, + { + "epoch": 0.32842840866876916, + "grad_norm": 8.631624221801758, + "learning_rate": 4.452619318885385e-05, + "loss": 3.1148, + "step": 1056500 + }, + { + "epoch": 0.328583840949256, + "grad_norm": 9.004417419433594, + "learning_rate": 4.452360265084574e-05, + "loss": 3.0335, + "step": 1057000 + }, + { + "epoch": 0.3287392732297429, + "grad_norm": 8.238847732543945, + "learning_rate": 4.452101211283762e-05, + "loss": 3.0589, + "step": 1057500 + }, + { + "epoch": 0.32889470551022976, + "grad_norm": 9.102740287780762, + "learning_rate": 4.4518421574829506e-05, + "loss": 3.0572, + "step": 1058000 + }, + { + "epoch": 0.3290501377907167, + "grad_norm": 8.880905151367188, + "learning_rate": 4.451583103682139e-05, + "loss": 3.0571, + "step": 1058500 + }, + { + "epoch": 0.32920557007120355, + "grad_norm": 7.409605026245117, + "learning_rate": 4.451324049881327e-05, + "loss": 3.019, + "step": 1059000 + }, + { + "epoch": 0.3293610023516904, + "grad_norm": 8.348042488098145, + "learning_rate": 4.451064996080516e-05, + "loss": 3.0674, + "step": 1059500 + }, + { + "epoch": 0.3295164346321773, + "grad_norm": 11.12561321258545, + "learning_rate": 4.450805942279705e-05, + "loss": 3.0805, + "step": 1060000 + }, + { + "epoch": 0.32967186691266415, + "grad_norm": 8.788908958435059, + "learning_rate": 4.4505468884788935e-05, + "loss": 3.0174, + "step": 1060500 + }, + { + "epoch": 0.329827299193151, + "grad_norm": 7.280161380767822, + "learning_rate": 4.450287834678082e-05, + "loss": 3.0117, + "step": 1061000 + }, + { + "epoch": 0.32998273147363794, + "grad_norm": 19.305030822753906, + "learning_rate": 4.450028780877271e-05, + "loss": 3.0703, + "step": 1061500 + }, + { + "epoch": 0.3301381637541248, + "grad_norm": 8.008371353149414, + "learning_rate": 4.449769727076459e-05, + "loss": 3.0845, + "step": 1062000 + }, + { + "epoch": 0.33029359603461167, + "grad_norm": 6.077874660491943, + "learning_rate": 4.4495106732756476e-05, + "loss": 3.0326, + "step": 1062500 + }, + { + "epoch": 0.33044902831509854, + "grad_norm": 7.449452877044678, + "learning_rate": 4.4492516194748364e-05, + "loss": 3.0511, + "step": 1063000 + }, + { + "epoch": 0.3306044605955854, + "grad_norm": 8.549592971801758, + "learning_rate": 4.4489925656740244e-05, + "loss": 3.0178, + "step": 1063500 + }, + { + "epoch": 0.33075989287607227, + "grad_norm": 15.652875900268555, + "learning_rate": 4.448733511873213e-05, + "loss": 3.0627, + "step": 1064000 + }, + { + "epoch": 0.33091532515655914, + "grad_norm": 7.132636070251465, + "learning_rate": 4.448474458072401e-05, + "loss": 3.0548, + "step": 1064500 + }, + { + "epoch": 0.33107075743704606, + "grad_norm": 18.031147003173828, + "learning_rate": 4.44821540427159e-05, + "loss": 3.0898, + "step": 1065000 + }, + { + "epoch": 0.3312261897175329, + "grad_norm": 8.555007934570312, + "learning_rate": 4.4479563504707786e-05, + "loss": 2.9787, + "step": 1065500 + }, + { + "epoch": 0.3313816219980198, + "grad_norm": 11.087206840515137, + "learning_rate": 4.447697296669967e-05, + "loss": 3.0773, + "step": 1066000 + }, + { + "epoch": 0.33153705427850666, + "grad_norm": 9.022427558898926, + "learning_rate": 4.447438242869156e-05, + "loss": 3.0106, + "step": 1066500 + }, + { + "epoch": 0.3316924865589935, + "grad_norm": 9.025009155273438, + "learning_rate": 4.447179189068345e-05, + "loss": 3.032, + "step": 1067000 + }, + { + "epoch": 0.3318479188394804, + "grad_norm": 8.76302433013916, + "learning_rate": 4.446920135267533e-05, + "loss": 3.0174, + "step": 1067500 + }, + { + "epoch": 0.3320033511199673, + "grad_norm": 7.449962615966797, + "learning_rate": 4.4466610814667215e-05, + "loss": 3.0295, + "step": 1068000 + }, + { + "epoch": 0.3321587834004542, + "grad_norm": 8.387984275817871, + "learning_rate": 4.44640202766591e-05, + "loss": 3.0819, + "step": 1068500 + }, + { + "epoch": 0.33231421568094105, + "grad_norm": 22.121999740600586, + "learning_rate": 4.446142973865098e-05, + "loss": 3.0531, + "step": 1069000 + }, + { + "epoch": 0.3324696479614279, + "grad_norm": 6.4817891120910645, + "learning_rate": 4.445883920064287e-05, + "loss": 3.083, + "step": 1069500 + }, + { + "epoch": 0.3326250802419148, + "grad_norm": 8.456538200378418, + "learning_rate": 4.445624866263475e-05, + "loss": 3.0026, + "step": 1070000 + }, + { + "epoch": 0.33278051252240165, + "grad_norm": 22.206018447875977, + "learning_rate": 4.4453658124626644e-05, + "loss": 3.0817, + "step": 1070500 + }, + { + "epoch": 0.3329359448028886, + "grad_norm": 7.397619247436523, + "learning_rate": 4.445106758661853e-05, + "loss": 3.0142, + "step": 1071000 + }, + { + "epoch": 0.33309137708337544, + "grad_norm": 5.92830228805542, + "learning_rate": 4.444847704861041e-05, + "loss": 3.0303, + "step": 1071500 + }, + { + "epoch": 0.3332468093638623, + "grad_norm": 9.124751091003418, + "learning_rate": 4.44458865106023e-05, + "loss": 3.0957, + "step": 1072000 + }, + { + "epoch": 0.3334022416443492, + "grad_norm": 9.374452590942383, + "learning_rate": 4.4443295972594186e-05, + "loss": 3.0601, + "step": 1072500 + }, + { + "epoch": 0.33355767392483604, + "grad_norm": 10.169293403625488, + "learning_rate": 4.4440705434586066e-05, + "loss": 3.0245, + "step": 1073000 + }, + { + "epoch": 0.3337131062053229, + "grad_norm": 9.092777252197266, + "learning_rate": 4.443811489657795e-05, + "loss": 3.045, + "step": 1073500 + }, + { + "epoch": 0.33386853848580983, + "grad_norm": 14.731879234313965, + "learning_rate": 4.443552435856984e-05, + "loss": 3.0439, + "step": 1074000 + }, + { + "epoch": 0.3340239707662967, + "grad_norm": 8.121811866760254, + "learning_rate": 4.443293382056172e-05, + "loss": 3.0038, + "step": 1074500 + }, + { + "epoch": 0.33417940304678356, + "grad_norm": 6.889671802520752, + "learning_rate": 4.443034328255361e-05, + "loss": 3.0798, + "step": 1075000 + }, + { + "epoch": 0.33433483532727043, + "grad_norm": 9.773881912231445, + "learning_rate": 4.4427752744545495e-05, + "loss": 3.0147, + "step": 1075500 + }, + { + "epoch": 0.3344902676077573, + "grad_norm": 7.681098937988281, + "learning_rate": 4.442516220653738e-05, + "loss": 3.0717, + "step": 1076000 + }, + { + "epoch": 0.33464569988824416, + "grad_norm": 8.56230640411377, + "learning_rate": 4.442257166852927e-05, + "loss": 3.0546, + "step": 1076500 + }, + { + "epoch": 0.3348011321687311, + "grad_norm": 12.260516166687012, + "learning_rate": 4.441998113052115e-05, + "loss": 3.0618, + "step": 1077000 + }, + { + "epoch": 0.33495656444921795, + "grad_norm": 33.320072174072266, + "learning_rate": 4.441739059251304e-05, + "loss": 3.0386, + "step": 1077500 + }, + { + "epoch": 0.3351119967297048, + "grad_norm": 8.573784828186035, + "learning_rate": 4.4414800054504924e-05, + "loss": 3.0301, + "step": 1078000 + }, + { + "epoch": 0.3352674290101917, + "grad_norm": 7.361313343048096, + "learning_rate": 4.4412209516496804e-05, + "loss": 3.0966, + "step": 1078500 + }, + { + "epoch": 0.33542286129067855, + "grad_norm": 7.600867748260498, + "learning_rate": 4.440961897848869e-05, + "loss": 3.0486, + "step": 1079000 + }, + { + "epoch": 0.3355782935711654, + "grad_norm": 8.864408493041992, + "learning_rate": 4.440702844048058e-05, + "loss": 3.0455, + "step": 1079500 + }, + { + "epoch": 0.33573372585165234, + "grad_norm": 9.291287422180176, + "learning_rate": 4.440443790247246e-05, + "loss": 3.0397, + "step": 1080000 + }, + { + "epoch": 0.3358891581321392, + "grad_norm": 9.424667358398438, + "learning_rate": 4.440184736446435e-05, + "loss": 3.0299, + "step": 1080500 + }, + { + "epoch": 0.3360445904126261, + "grad_norm": 8.631013870239258, + "learning_rate": 4.439925682645624e-05, + "loss": 3.0361, + "step": 1081000 + }, + { + "epoch": 0.33620002269311294, + "grad_norm": 8.644669532775879, + "learning_rate": 4.439666628844812e-05, + "loss": 2.9907, + "step": 1081500 + }, + { + "epoch": 0.3363554549735998, + "grad_norm": 41.64324188232422, + "learning_rate": 4.439407575044001e-05, + "loss": 3.022, + "step": 1082000 + }, + { + "epoch": 0.3365108872540867, + "grad_norm": 8.022873878479004, + "learning_rate": 4.439148521243189e-05, + "loss": 3.0666, + "step": 1082500 + }, + { + "epoch": 0.3366663195345736, + "grad_norm": 12.609173774719238, + "learning_rate": 4.4388894674423775e-05, + "loss": 3.0563, + "step": 1083000 + }, + { + "epoch": 0.33682175181506047, + "grad_norm": 6.639911651611328, + "learning_rate": 4.438630413641566e-05, + "loss": 3.0698, + "step": 1083500 + }, + { + "epoch": 0.33697718409554733, + "grad_norm": 12.850539207458496, + "learning_rate": 4.438371359840754e-05, + "loss": 3.0514, + "step": 1084000 + }, + { + "epoch": 0.3371326163760342, + "grad_norm": 8.228989601135254, + "learning_rate": 4.438112306039943e-05, + "loss": 3.0397, + "step": 1084500 + }, + { + "epoch": 0.33728804865652107, + "grad_norm": 8.850831985473633, + "learning_rate": 4.437853252239132e-05, + "loss": 3.0754, + "step": 1085000 + }, + { + "epoch": 0.33744348093700793, + "grad_norm": 9.393580436706543, + "learning_rate": 4.4375941984383204e-05, + "loss": 3.0269, + "step": 1085500 + }, + { + "epoch": 0.33759891321749486, + "grad_norm": 6.9835052490234375, + "learning_rate": 4.437335144637509e-05, + "loss": 3.1052, + "step": 1086000 + }, + { + "epoch": 0.3377543454979817, + "grad_norm": 8.012137413024902, + "learning_rate": 4.437076090836698e-05, + "loss": 3.0591, + "step": 1086500 + }, + { + "epoch": 0.3379097777784686, + "grad_norm": 8.355034828186035, + "learning_rate": 4.436817037035886e-05, + "loss": 3.031, + "step": 1087000 + }, + { + "epoch": 0.33806521005895546, + "grad_norm": 12.463892936706543, + "learning_rate": 4.4365579832350746e-05, + "loss": 3.0457, + "step": 1087500 + }, + { + "epoch": 0.3382206423394423, + "grad_norm": 19.24110984802246, + "learning_rate": 4.4362989294342626e-05, + "loss": 3.0418, + "step": 1088000 + }, + { + "epoch": 0.3383760746199292, + "grad_norm": 16.34261131286621, + "learning_rate": 4.4360398756334513e-05, + "loss": 3.0721, + "step": 1088500 + }, + { + "epoch": 0.3385315069004161, + "grad_norm": 8.477856636047363, + "learning_rate": 4.43578082183264e-05, + "loss": 3.0072, + "step": 1089000 + }, + { + "epoch": 0.338686939180903, + "grad_norm": 15.854722023010254, + "learning_rate": 4.435521768031828e-05, + "loss": 3.0111, + "step": 1089500 + }, + { + "epoch": 0.33884237146138985, + "grad_norm": 8.007303237915039, + "learning_rate": 4.435262714231017e-05, + "loss": 3.0465, + "step": 1090000 + }, + { + "epoch": 0.3389978037418767, + "grad_norm": 14.618905067443848, + "learning_rate": 4.435003660430206e-05, + "loss": 3.0184, + "step": 1090500 + }, + { + "epoch": 0.3391532360223636, + "grad_norm": 10.537778854370117, + "learning_rate": 4.434744606629394e-05, + "loss": 3.0611, + "step": 1091000 + }, + { + "epoch": 0.33930866830285045, + "grad_norm": 13.056293487548828, + "learning_rate": 4.434485552828583e-05, + "loss": 3.0105, + "step": 1091500 + }, + { + "epoch": 0.33946410058333737, + "grad_norm": 8.018132209777832, + "learning_rate": 4.434226499027772e-05, + "loss": 3.0252, + "step": 1092000 + }, + { + "epoch": 0.33961953286382424, + "grad_norm": 8.05031967163086, + "learning_rate": 4.43396744522696e-05, + "loss": 3.0246, + "step": 1092500 + }, + { + "epoch": 0.3397749651443111, + "grad_norm": 7.9388933181762695, + "learning_rate": 4.4337083914261484e-05, + "loss": 3.0466, + "step": 1093000 + }, + { + "epoch": 0.33993039742479797, + "grad_norm": 9.609416961669922, + "learning_rate": 4.4334493376253365e-05, + "loss": 3.0404, + "step": 1093500 + }, + { + "epoch": 0.34008582970528484, + "grad_norm": 7.570411205291748, + "learning_rate": 4.433190283824525e-05, + "loss": 3.0108, + "step": 1094000 + }, + { + "epoch": 0.3402412619857717, + "grad_norm": 6.939256191253662, + "learning_rate": 4.432931230023714e-05, + "loss": 3.011, + "step": 1094500 + }, + { + "epoch": 0.3403966942662586, + "grad_norm": 7.16133975982666, + "learning_rate": 4.4326721762229026e-05, + "loss": 3.0289, + "step": 1095000 + }, + { + "epoch": 0.3405521265467455, + "grad_norm": 11.100675582885742, + "learning_rate": 4.432413122422091e-05, + "loss": 3.0359, + "step": 1095500 + }, + { + "epoch": 0.34070755882723236, + "grad_norm": 39.531333923339844, + "learning_rate": 4.43215406862128e-05, + "loss": 3.0848, + "step": 1096000 + }, + { + "epoch": 0.3408629911077192, + "grad_norm": 7.5349602699279785, + "learning_rate": 4.431895014820468e-05, + "loss": 3.0467, + "step": 1096500 + }, + { + "epoch": 0.3410184233882061, + "grad_norm": 6.830658435821533, + "learning_rate": 4.431635961019657e-05, + "loss": 3.0735, + "step": 1097000 + }, + { + "epoch": 0.34117385566869296, + "grad_norm": 8.47252368927002, + "learning_rate": 4.4313769072188455e-05, + "loss": 3.0275, + "step": 1097500 + }, + { + "epoch": 0.3413292879491799, + "grad_norm": 17.348407745361328, + "learning_rate": 4.4311178534180335e-05, + "loss": 3.0463, + "step": 1098000 + }, + { + "epoch": 0.34148472022966675, + "grad_norm": 7.902219772338867, + "learning_rate": 4.430858799617222e-05, + "loss": 3.0396, + "step": 1098500 + }, + { + "epoch": 0.3416401525101536, + "grad_norm": 7.849472522735596, + "learning_rate": 4.430599745816411e-05, + "loss": 3.0225, + "step": 1099000 + }, + { + "epoch": 0.3417955847906405, + "grad_norm": 9.952588081359863, + "learning_rate": 4.430340692015599e-05, + "loss": 3.0199, + "step": 1099500 + }, + { + "epoch": 0.34195101707112735, + "grad_norm": 10.199652671813965, + "learning_rate": 4.430081638214788e-05, + "loss": 3.0619, + "step": 1100000 + }, + { + "epoch": 0.3421064493516142, + "grad_norm": 7.7986297607421875, + "learning_rate": 4.4298225844139764e-05, + "loss": 3.0501, + "step": 1100500 + }, + { + "epoch": 0.34226188163210114, + "grad_norm": 7.115299701690674, + "learning_rate": 4.429563530613165e-05, + "loss": 3.0488, + "step": 1101000 + }, + { + "epoch": 0.342417313912588, + "grad_norm": 7.883884906768799, + "learning_rate": 4.429304476812354e-05, + "loss": 3.0485, + "step": 1101500 + }, + { + "epoch": 0.3425727461930749, + "grad_norm": 10.760358810424805, + "learning_rate": 4.429045423011542e-05, + "loss": 3.0631, + "step": 1102000 + }, + { + "epoch": 0.34272817847356174, + "grad_norm": 10.42679500579834, + "learning_rate": 4.4287863692107306e-05, + "loss": 3.0178, + "step": 1102500 + }, + { + "epoch": 0.3428836107540486, + "grad_norm": 8.168416023254395, + "learning_rate": 4.4285273154099193e-05, + "loss": 3.0409, + "step": 1103000 + }, + { + "epoch": 0.3430390430345355, + "grad_norm": 8.344667434692383, + "learning_rate": 4.4282682616091074e-05, + "loss": 3.0347, + "step": 1103500 + }, + { + "epoch": 0.3431944753150224, + "grad_norm": 7.391808032989502, + "learning_rate": 4.428009207808296e-05, + "loss": 3.0258, + "step": 1104000 + }, + { + "epoch": 0.34334990759550926, + "grad_norm": 7.418473720550537, + "learning_rate": 4.427750154007485e-05, + "loss": 3.0608, + "step": 1104500 + }, + { + "epoch": 0.34350533987599613, + "grad_norm": 6.96732234954834, + "learning_rate": 4.4274911002066735e-05, + "loss": 3.0824, + "step": 1105000 + }, + { + "epoch": 0.343660772156483, + "grad_norm": 10.968291282653809, + "learning_rate": 4.427232046405862e-05, + "loss": 3.0489, + "step": 1105500 + }, + { + "epoch": 0.34381620443696986, + "grad_norm": 9.134038925170898, + "learning_rate": 4.42697299260505e-05, + "loss": 3.0232, + "step": 1106000 + }, + { + "epoch": 0.34397163671745673, + "grad_norm": 8.117990493774414, + "learning_rate": 4.426713938804239e-05, + "loss": 3.0606, + "step": 1106500 + }, + { + "epoch": 0.34412706899794365, + "grad_norm": 8.925477027893066, + "learning_rate": 4.426454885003428e-05, + "loss": 2.9978, + "step": 1107000 + }, + { + "epoch": 0.3442825012784305, + "grad_norm": 7.416977882385254, + "learning_rate": 4.426195831202616e-05, + "loss": 3.0847, + "step": 1107500 + }, + { + "epoch": 0.3444379335589174, + "grad_norm": 8.05455493927002, + "learning_rate": 4.4259367774018045e-05, + "loss": 3.0327, + "step": 1108000 + }, + { + "epoch": 0.34459336583940425, + "grad_norm": 6.868621826171875, + "learning_rate": 4.425677723600993e-05, + "loss": 3.0175, + "step": 1108500 + }, + { + "epoch": 0.3447487981198911, + "grad_norm": 9.860404968261719, + "learning_rate": 4.425418669800181e-05, + "loss": 3.0802, + "step": 1109000 + }, + { + "epoch": 0.344904230400378, + "grad_norm": 7.146548748016357, + "learning_rate": 4.42515961599937e-05, + "loss": 3.0421, + "step": 1109500 + }, + { + "epoch": 0.3450596626808649, + "grad_norm": 11.79326343536377, + "learning_rate": 4.4249005621985587e-05, + "loss": 2.9891, + "step": 1110000 + }, + { + "epoch": 0.3452150949613518, + "grad_norm": 7.328494071960449, + "learning_rate": 4.4246415083977474e-05, + "loss": 3.0427, + "step": 1110500 + }, + { + "epoch": 0.34537052724183864, + "grad_norm": 10.607451438903809, + "learning_rate": 4.424382454596936e-05, + "loss": 3.041, + "step": 1111000 + }, + { + "epoch": 0.3455259595223255, + "grad_norm": 7.746840000152588, + "learning_rate": 4.424123400796124e-05, + "loss": 3.0274, + "step": 1111500 + }, + { + "epoch": 0.3456813918028124, + "grad_norm": 14.365445137023926, + "learning_rate": 4.423864346995313e-05, + "loss": 3.0009, + "step": 1112000 + }, + { + "epoch": 0.34583682408329924, + "grad_norm": 8.435098648071289, + "learning_rate": 4.4236052931945016e-05, + "loss": 3.0698, + "step": 1112500 + }, + { + "epoch": 0.34599225636378617, + "grad_norm": 8.14403247833252, + "learning_rate": 4.4233462393936896e-05, + "loss": 3.0648, + "step": 1113000 + }, + { + "epoch": 0.34614768864427303, + "grad_norm": 7.151078224182129, + "learning_rate": 4.423087185592878e-05, + "loss": 3.0266, + "step": 1113500 + }, + { + "epoch": 0.3463031209247599, + "grad_norm": 8.78545093536377, + "learning_rate": 4.422828131792067e-05, + "loss": 3.1183, + "step": 1114000 + }, + { + "epoch": 0.34645855320524677, + "grad_norm": 8.589553833007812, + "learning_rate": 4.422569077991256e-05, + "loss": 3.0349, + "step": 1114500 + }, + { + "epoch": 0.34661398548573363, + "grad_norm": 9.439984321594238, + "learning_rate": 4.4223100241904444e-05, + "loss": 3.0527, + "step": 1115000 + }, + { + "epoch": 0.3467694177662205, + "grad_norm": 10.9592924118042, + "learning_rate": 4.422050970389633e-05, + "loss": 3.0351, + "step": 1115500 + }, + { + "epoch": 0.3469248500467074, + "grad_norm": 10.982809066772461, + "learning_rate": 4.421791916588821e-05, + "loss": 3.0461, + "step": 1116000 + }, + { + "epoch": 0.3470802823271943, + "grad_norm": 7.830629825592041, + "learning_rate": 4.42153286278801e-05, + "loss": 3.0386, + "step": 1116500 + }, + { + "epoch": 0.34723571460768116, + "grad_norm": 8.546788215637207, + "learning_rate": 4.4212738089871986e-05, + "loss": 3.041, + "step": 1117000 + }, + { + "epoch": 0.347391146888168, + "grad_norm": 15.89035415649414, + "learning_rate": 4.421014755186387e-05, + "loss": 3.0651, + "step": 1117500 + }, + { + "epoch": 0.3475465791686549, + "grad_norm": 9.387702941894531, + "learning_rate": 4.4207557013855754e-05, + "loss": 3.0444, + "step": 1118000 + }, + { + "epoch": 0.34770201144914176, + "grad_norm": 6.35434103012085, + "learning_rate": 4.4204966475847634e-05, + "loss": 3.0044, + "step": 1118500 + }, + { + "epoch": 0.3478574437296287, + "grad_norm": 7.060772895812988, + "learning_rate": 4.420237593783952e-05, + "loss": 3.0465, + "step": 1119000 + }, + { + "epoch": 0.34801287601011555, + "grad_norm": 7.0443644523620605, + "learning_rate": 4.419978539983141e-05, + "loss": 3.0825, + "step": 1119500 + }, + { + "epoch": 0.3481683082906024, + "grad_norm": 8.010613441467285, + "learning_rate": 4.4197194861823296e-05, + "loss": 3.0308, + "step": 1120000 + }, + { + "epoch": 0.3483237405710893, + "grad_norm": 8.973621368408203, + "learning_rate": 4.419460432381518e-05, + "loss": 3.0012, + "step": 1120500 + }, + { + "epoch": 0.34847917285157615, + "grad_norm": 9.036954879760742, + "learning_rate": 4.419201378580707e-05, + "loss": 3.034, + "step": 1121000 + }, + { + "epoch": 0.348634605132063, + "grad_norm": 7.795248031616211, + "learning_rate": 4.418942324779895e-05, + "loss": 3.0412, + "step": 1121500 + }, + { + "epoch": 0.34879003741254994, + "grad_norm": 8.602263450622559, + "learning_rate": 4.418683270979084e-05, + "loss": 3.0262, + "step": 1122000 + }, + { + "epoch": 0.3489454696930368, + "grad_norm": 7.961662292480469, + "learning_rate": 4.4184242171782725e-05, + "loss": 3.0195, + "step": 1122500 + }, + { + "epoch": 0.34910090197352367, + "grad_norm": 4.898029804229736, + "learning_rate": 4.4181651633774605e-05, + "loss": 3.0238, + "step": 1123000 + }, + { + "epoch": 0.34925633425401054, + "grad_norm": 9.964293479919434, + "learning_rate": 4.417906109576649e-05, + "loss": 3.0718, + "step": 1123500 + }, + { + "epoch": 0.3494117665344974, + "grad_norm": 7.8206281661987305, + "learning_rate": 4.417647055775837e-05, + "loss": 3.0124, + "step": 1124000 + }, + { + "epoch": 0.34956719881498427, + "grad_norm": 8.389216423034668, + "learning_rate": 4.4173880019750267e-05, + "loss": 3.0233, + "step": 1124500 + }, + { + "epoch": 0.3497226310954712, + "grad_norm": 12.151901245117188, + "learning_rate": 4.4171289481742154e-05, + "loss": 3.0266, + "step": 1125000 + }, + { + "epoch": 0.34987806337595806, + "grad_norm": 7.415958881378174, + "learning_rate": 4.4168698943734034e-05, + "loss": 2.987, + "step": 1125500 + }, + { + "epoch": 0.3500334956564449, + "grad_norm": 5.554433345794678, + "learning_rate": 4.416610840572592e-05, + "loss": 3.0766, + "step": 1126000 + }, + { + "epoch": 0.3501889279369318, + "grad_norm": 9.601375579833984, + "learning_rate": 4.416351786771781e-05, + "loss": 3.055, + "step": 1126500 + }, + { + "epoch": 0.35034436021741866, + "grad_norm": 10.393735885620117, + "learning_rate": 4.416092732970969e-05, + "loss": 3.0435, + "step": 1127000 + }, + { + "epoch": 0.3504997924979055, + "grad_norm": 9.211447715759277, + "learning_rate": 4.4158336791701576e-05, + "loss": 3.0263, + "step": 1127500 + }, + { + "epoch": 0.35065522477839245, + "grad_norm": 57.898826599121094, + "learning_rate": 4.415574625369346e-05, + "loss": 2.989, + "step": 1128000 + }, + { + "epoch": 0.3508106570588793, + "grad_norm": 9.544525146484375, + "learning_rate": 4.4153155715685343e-05, + "loss": 3.0606, + "step": 1128500 + }, + { + "epoch": 0.3509660893393662, + "grad_norm": 29.677330017089844, + "learning_rate": 4.415056517767723e-05, + "loss": 2.9856, + "step": 1129000 + }, + { + "epoch": 0.35112152161985305, + "grad_norm": 8.0321683883667, + "learning_rate": 4.414797463966912e-05, + "loss": 3.0513, + "step": 1129500 + }, + { + "epoch": 0.3512769539003399, + "grad_norm": 9.103964805603027, + "learning_rate": 4.4145384101661005e-05, + "loss": 3.0723, + "step": 1130000 + }, + { + "epoch": 0.3514323861808268, + "grad_norm": 7.957248687744141, + "learning_rate": 4.414279356365289e-05, + "loss": 3.0357, + "step": 1130500 + }, + { + "epoch": 0.3515878184613137, + "grad_norm": 6.548687934875488, + "learning_rate": 4.414020302564477e-05, + "loss": 3.0201, + "step": 1131000 + }, + { + "epoch": 0.3517432507418006, + "grad_norm": 9.753606796264648, + "learning_rate": 4.413761248763666e-05, + "loss": 3.0498, + "step": 1131500 + }, + { + "epoch": 0.35189868302228744, + "grad_norm": 11.266825675964355, + "learning_rate": 4.413502194962855e-05, + "loss": 3.0561, + "step": 1132000 + }, + { + "epoch": 0.3520541153027743, + "grad_norm": 8.823127746582031, + "learning_rate": 4.413243141162043e-05, + "loss": 3.0848, + "step": 1132500 + }, + { + "epoch": 0.3522095475832612, + "grad_norm": 8.931831359863281, + "learning_rate": 4.4129840873612314e-05, + "loss": 3.0438, + "step": 1133000 + }, + { + "epoch": 0.35236497986374804, + "grad_norm": 6.424753189086914, + "learning_rate": 4.41272503356042e-05, + "loss": 3.0464, + "step": 1133500 + }, + { + "epoch": 0.35252041214423496, + "grad_norm": 10.458122253417969, + "learning_rate": 4.412465979759608e-05, + "loss": 3.0208, + "step": 1134000 + }, + { + "epoch": 0.35267584442472183, + "grad_norm": 11.94097900390625, + "learning_rate": 4.4122069259587976e-05, + "loss": 3.0691, + "step": 1134500 + }, + { + "epoch": 0.3528312767052087, + "grad_norm": 8.303802490234375, + "learning_rate": 4.411947872157986e-05, + "loss": 3.0414, + "step": 1135000 + }, + { + "epoch": 0.35298670898569556, + "grad_norm": 14.643795013427734, + "learning_rate": 4.411688818357174e-05, + "loss": 3.0637, + "step": 1135500 + }, + { + "epoch": 0.35314214126618243, + "grad_norm": 7.172959327697754, + "learning_rate": 4.411429764556363e-05, + "loss": 3.0202, + "step": 1136000 + }, + { + "epoch": 0.3532975735466693, + "grad_norm": 16.38578987121582, + "learning_rate": 4.411170710755551e-05, + "loss": 2.9969, + "step": 1136500 + }, + { + "epoch": 0.3534530058271562, + "grad_norm": 8.202418327331543, + "learning_rate": 4.41091165695474e-05, + "loss": 2.9764, + "step": 1137000 + }, + { + "epoch": 0.3536084381076431, + "grad_norm": 47.34149932861328, + "learning_rate": 4.4106526031539285e-05, + "loss": 3.057, + "step": 1137500 + }, + { + "epoch": 0.35376387038812995, + "grad_norm": 6.779778480529785, + "learning_rate": 4.4103935493531165e-05, + "loss": 3.0246, + "step": 1138000 + }, + { + "epoch": 0.3539193026686168, + "grad_norm": 7.5239434242248535, + "learning_rate": 4.410134495552305e-05, + "loss": 3.0236, + "step": 1138500 + }, + { + "epoch": 0.3540747349491037, + "grad_norm": 15.239812850952148, + "learning_rate": 4.409875441751494e-05, + "loss": 3.0409, + "step": 1139000 + }, + { + "epoch": 0.35423016722959055, + "grad_norm": 8.879515647888184, + "learning_rate": 4.409616387950683e-05, + "loss": 3.0192, + "step": 1139500 + }, + { + "epoch": 0.3543855995100775, + "grad_norm": 8.126730918884277, + "learning_rate": 4.4093573341498714e-05, + "loss": 2.9745, + "step": 1140000 + }, + { + "epoch": 0.35454103179056434, + "grad_norm": 9.221545219421387, + "learning_rate": 4.40909828034906e-05, + "loss": 3.0435, + "step": 1140500 + }, + { + "epoch": 0.3546964640710512, + "grad_norm": 7.0149335861206055, + "learning_rate": 4.408839226548248e-05, + "loss": 3.0575, + "step": 1141000 + }, + { + "epoch": 0.3548518963515381, + "grad_norm": 8.534178733825684, + "learning_rate": 4.408580172747437e-05, + "loss": 3.0436, + "step": 1141500 + }, + { + "epoch": 0.35500732863202494, + "grad_norm": 7.714475631713867, + "learning_rate": 4.408321118946625e-05, + "loss": 3.0052, + "step": 1142000 + }, + { + "epoch": 0.3551627609125118, + "grad_norm": 8.079378128051758, + "learning_rate": 4.4080620651458136e-05, + "loss": 3.0116, + "step": 1142500 + }, + { + "epoch": 0.35531819319299873, + "grad_norm": 36.29160690307617, + "learning_rate": 4.4078030113450023e-05, + "loss": 3.0594, + "step": 1143000 + }, + { + "epoch": 0.3554736254734856, + "grad_norm": 9.321693420410156, + "learning_rate": 4.4075439575441904e-05, + "loss": 3.0636, + "step": 1143500 + }, + { + "epoch": 0.35562905775397247, + "grad_norm": 8.67084789276123, + "learning_rate": 4.407284903743379e-05, + "loss": 3.0143, + "step": 1144000 + }, + { + "epoch": 0.35578449003445933, + "grad_norm": 6.271648406982422, + "learning_rate": 4.4070258499425685e-05, + "loss": 3.0239, + "step": 1144500 + }, + { + "epoch": 0.3559399223149462, + "grad_norm": 6.421993732452393, + "learning_rate": 4.4067667961417565e-05, + "loss": 3.0124, + "step": 1145000 + }, + { + "epoch": 0.35609535459543307, + "grad_norm": 7.23102331161499, + "learning_rate": 4.406507742340945e-05, + "loss": 3.0325, + "step": 1145500 + }, + { + "epoch": 0.35625078687592, + "grad_norm": 6.303463459014893, + "learning_rate": 4.406248688540134e-05, + "loss": 3.0763, + "step": 1146000 + }, + { + "epoch": 0.35640621915640686, + "grad_norm": 7.379007339477539, + "learning_rate": 4.405989634739322e-05, + "loss": 3.0033, + "step": 1146500 + }, + { + "epoch": 0.3565616514368937, + "grad_norm": 7.594627857208252, + "learning_rate": 4.405730580938511e-05, + "loss": 3.0388, + "step": 1147000 + }, + { + "epoch": 0.3567170837173806, + "grad_norm": 7.736152648925781, + "learning_rate": 4.405471527137699e-05, + "loss": 3.0467, + "step": 1147500 + }, + { + "epoch": 0.35687251599786746, + "grad_norm": 9.053566932678223, + "learning_rate": 4.4052124733368875e-05, + "loss": 3.0587, + "step": 1148000 + }, + { + "epoch": 0.3570279482783543, + "grad_norm": 7.48646354675293, + "learning_rate": 4.404953419536076e-05, + "loss": 3.0292, + "step": 1148500 + }, + { + "epoch": 0.35718338055884125, + "grad_norm": 7.00542688369751, + "learning_rate": 4.404694365735265e-05, + "loss": 2.9851, + "step": 1149000 + }, + { + "epoch": 0.3573388128393281, + "grad_norm": 9.369489669799805, + "learning_rate": 4.4044353119344536e-05, + "loss": 3.0306, + "step": 1149500 + }, + { + "epoch": 0.357494245119815, + "grad_norm": 8.771810531616211, + "learning_rate": 4.404176258133642e-05, + "loss": 3.0657, + "step": 1150000 + }, + { + "epoch": 0.35764967740030185, + "grad_norm": 7.611443996429443, + "learning_rate": 4.4039172043328304e-05, + "loss": 3.0598, + "step": 1150500 + }, + { + "epoch": 0.3578051096807887, + "grad_norm": 23.8966121673584, + "learning_rate": 4.403658150532019e-05, + "loss": 3.085, + "step": 1151000 + }, + { + "epoch": 0.3579605419612756, + "grad_norm": 12.525221824645996, + "learning_rate": 4.403399096731208e-05, + "loss": 3.0169, + "step": 1151500 + }, + { + "epoch": 0.3581159742417625, + "grad_norm": 8.031381607055664, + "learning_rate": 4.403140042930396e-05, + "loss": 3.0256, + "step": 1152000 + }, + { + "epoch": 0.35827140652224937, + "grad_norm": 23.880704879760742, + "learning_rate": 4.4028809891295845e-05, + "loss": 3.0803, + "step": 1152500 + }, + { + "epoch": 0.35842683880273624, + "grad_norm": 12.824649810791016, + "learning_rate": 4.402621935328773e-05, + "loss": 3.0156, + "step": 1153000 + }, + { + "epoch": 0.3585822710832231, + "grad_norm": 7.39301061630249, + "learning_rate": 4.402362881527961e-05, + "loss": 2.9914, + "step": 1153500 + }, + { + "epoch": 0.35873770336370997, + "grad_norm": 8.20748519897461, + "learning_rate": 4.40210382772715e-05, + "loss": 3.077, + "step": 1154000 + }, + { + "epoch": 0.35889313564419684, + "grad_norm": 8.850237846374512, + "learning_rate": 4.401844773926339e-05, + "loss": 3.038, + "step": 1154500 + }, + { + "epoch": 0.35904856792468376, + "grad_norm": 11.670211791992188, + "learning_rate": 4.4015857201255274e-05, + "loss": 3.0449, + "step": 1155000 + }, + { + "epoch": 0.3592040002051706, + "grad_norm": 7.615373611450195, + "learning_rate": 4.401326666324716e-05, + "loss": 3.0503, + "step": 1155500 + }, + { + "epoch": 0.3593594324856575, + "grad_norm": 9.646461486816406, + "learning_rate": 4.401067612523904e-05, + "loss": 3.0806, + "step": 1156000 + }, + { + "epoch": 0.35951486476614436, + "grad_norm": 8.219464302062988, + "learning_rate": 4.400808558723093e-05, + "loss": 3.0638, + "step": 1156500 + }, + { + "epoch": 0.3596702970466312, + "grad_norm": 8.709843635559082, + "learning_rate": 4.4005495049222816e-05, + "loss": 3.0364, + "step": 1157000 + }, + { + "epoch": 0.3598257293271181, + "grad_norm": 9.88741397857666, + "learning_rate": 4.40029045112147e-05, + "loss": 3.025, + "step": 1157500 + }, + { + "epoch": 0.359981161607605, + "grad_norm": 9.980294227600098, + "learning_rate": 4.4000313973206584e-05, + "loss": 3.049, + "step": 1158000 + }, + { + "epoch": 0.3601365938880919, + "grad_norm": 8.743345260620117, + "learning_rate": 4.399772343519847e-05, + "loss": 3.0538, + "step": 1158500 + }, + { + "epoch": 0.36029202616857875, + "grad_norm": 9.856380462646484, + "learning_rate": 4.399513289719036e-05, + "loss": 3.0224, + "step": 1159000 + }, + { + "epoch": 0.3604474584490656, + "grad_norm": 11.280341148376465, + "learning_rate": 4.3992542359182245e-05, + "loss": 3.0564, + "step": 1159500 + }, + { + "epoch": 0.3606028907295525, + "grad_norm": 11.704926490783691, + "learning_rate": 4.3989951821174126e-05, + "loss": 2.9971, + "step": 1160000 + }, + { + "epoch": 0.36075832301003935, + "grad_norm": 8.382568359375, + "learning_rate": 4.398736128316601e-05, + "loss": 2.9971, + "step": 1160500 + }, + { + "epoch": 0.3609137552905263, + "grad_norm": 17.42222785949707, + "learning_rate": 4.39847707451579e-05, + "loss": 3.0372, + "step": 1161000 + }, + { + "epoch": 0.36106918757101314, + "grad_norm": 8.723426818847656, + "learning_rate": 4.398218020714978e-05, + "loss": 3.0202, + "step": 1161500 + }, + { + "epoch": 0.3612246198515, + "grad_norm": 9.192254066467285, + "learning_rate": 4.397958966914167e-05, + "loss": 3.0646, + "step": 1162000 + }, + { + "epoch": 0.3613800521319869, + "grad_norm": 6.19033670425415, + "learning_rate": 4.3976999131133555e-05, + "loss": 3.0045, + "step": 1162500 + }, + { + "epoch": 0.36153548441247374, + "grad_norm": 9.64167308807373, + "learning_rate": 4.3974408593125435e-05, + "loss": 3.03, + "step": 1163000 + }, + { + "epoch": 0.3616909166929606, + "grad_norm": 9.683025360107422, + "learning_rate": 4.397181805511732e-05, + "loss": 3.0659, + "step": 1163500 + }, + { + "epoch": 0.36184634897344753, + "grad_norm": 9.546847343444824, + "learning_rate": 4.396922751710921e-05, + "loss": 2.9967, + "step": 1164000 + }, + { + "epoch": 0.3620017812539344, + "grad_norm": 7.4772467613220215, + "learning_rate": 4.3966636979101096e-05, + "loss": 3.0517, + "step": 1164500 + }, + { + "epoch": 0.36215721353442126, + "grad_norm": 8.12006950378418, + "learning_rate": 4.3964046441092984e-05, + "loss": 3.0127, + "step": 1165000 + }, + { + "epoch": 0.36231264581490813, + "grad_norm": 9.371198654174805, + "learning_rate": 4.3961455903084864e-05, + "loss": 2.978, + "step": 1165500 + }, + { + "epoch": 0.362468078095395, + "grad_norm": 8.01396369934082, + "learning_rate": 4.395886536507675e-05, + "loss": 3.0582, + "step": 1166000 + }, + { + "epoch": 0.36262351037588186, + "grad_norm": 6.104931831359863, + "learning_rate": 4.395627482706864e-05, + "loss": 3.0663, + "step": 1166500 + }, + { + "epoch": 0.3627789426563688, + "grad_norm": 17.07562828063965, + "learning_rate": 4.395368428906052e-05, + "loss": 3.0054, + "step": 1167000 + }, + { + "epoch": 0.36293437493685565, + "grad_norm": 17.373783111572266, + "learning_rate": 4.3951093751052406e-05, + "loss": 3.0427, + "step": 1167500 + }, + { + "epoch": 0.3630898072173425, + "grad_norm": 28.18296241760254, + "learning_rate": 4.394850321304429e-05, + "loss": 3.0187, + "step": 1168000 + }, + { + "epoch": 0.3632452394978294, + "grad_norm": 7.132038116455078, + "learning_rate": 4.394591267503618e-05, + "loss": 3.0121, + "step": 1168500 + }, + { + "epoch": 0.36340067177831625, + "grad_norm": 9.966873168945312, + "learning_rate": 4.394332213702807e-05, + "loss": 3.0337, + "step": 1169000 + }, + { + "epoch": 0.3635561040588031, + "grad_norm": 6.515743732452393, + "learning_rate": 4.3940731599019954e-05, + "loss": 3.0133, + "step": 1169500 + }, + { + "epoch": 0.36371153633929004, + "grad_norm": 9.340173721313477, + "learning_rate": 4.3938141061011835e-05, + "loss": 3.0521, + "step": 1170000 + }, + { + "epoch": 0.3638669686197769, + "grad_norm": 9.225554466247559, + "learning_rate": 4.393555052300372e-05, + "loss": 3.0133, + "step": 1170500 + }, + { + "epoch": 0.3640224009002638, + "grad_norm": 19.727205276489258, + "learning_rate": 4.393295998499561e-05, + "loss": 3.0925, + "step": 1171000 + }, + { + "epoch": 0.36417783318075064, + "grad_norm": 18.835498809814453, + "learning_rate": 4.393036944698749e-05, + "loss": 3.0698, + "step": 1171500 + }, + { + "epoch": 0.3643332654612375, + "grad_norm": 10.788559913635254, + "learning_rate": 4.392777890897938e-05, + "loss": 3.0035, + "step": 1172000 + }, + { + "epoch": 0.3644886977417244, + "grad_norm": 7.401065349578857, + "learning_rate": 4.392518837097126e-05, + "loss": 3.09, + "step": 1172500 + }, + { + "epoch": 0.3646441300222113, + "grad_norm": 8.814059257507324, + "learning_rate": 4.3922597832963144e-05, + "loss": 3.0449, + "step": 1173000 + }, + { + "epoch": 0.36479956230269817, + "grad_norm": 7.800710678100586, + "learning_rate": 4.392000729495503e-05, + "loss": 2.9929, + "step": 1173500 + }, + { + "epoch": 0.36495499458318503, + "grad_norm": 8.885250091552734, + "learning_rate": 4.391741675694692e-05, + "loss": 3.0176, + "step": 1174000 + }, + { + "epoch": 0.3651104268636719, + "grad_norm": 9.354445457458496, + "learning_rate": 4.3914826218938806e-05, + "loss": 3.0392, + "step": 1174500 + }, + { + "epoch": 0.36526585914415877, + "grad_norm": 7.981881618499756, + "learning_rate": 4.391223568093069e-05, + "loss": 3.0548, + "step": 1175000 + }, + { + "epoch": 0.36542129142464563, + "grad_norm": 6.590839862823486, + "learning_rate": 4.390964514292257e-05, + "loss": 3.0199, + "step": 1175500 + }, + { + "epoch": 0.36557672370513256, + "grad_norm": 10.566217422485352, + "learning_rate": 4.390705460491446e-05, + "loss": 3.0472, + "step": 1176000 + }, + { + "epoch": 0.3657321559856194, + "grad_norm": 8.704360961914062, + "learning_rate": 4.390446406690635e-05, + "loss": 3.0649, + "step": 1176500 + }, + { + "epoch": 0.3658875882661063, + "grad_norm": 7.641165733337402, + "learning_rate": 4.390187352889823e-05, + "loss": 3.0457, + "step": 1177000 + }, + { + "epoch": 0.36604302054659316, + "grad_norm": 8.08686351776123, + "learning_rate": 4.3899282990890115e-05, + "loss": 3.0416, + "step": 1177500 + }, + { + "epoch": 0.36619845282708, + "grad_norm": 13.087449073791504, + "learning_rate": 4.3896692452882e-05, + "loss": 3.0206, + "step": 1178000 + }, + { + "epoch": 0.3663538851075669, + "grad_norm": 8.697279930114746, + "learning_rate": 4.389410191487389e-05, + "loss": 3.0077, + "step": 1178500 + }, + { + "epoch": 0.3665093173880538, + "grad_norm": 6.841578483581543, + "learning_rate": 4.3891511376865777e-05, + "loss": 2.995, + "step": 1179000 + }, + { + "epoch": 0.3666647496685407, + "grad_norm": 7.12892484664917, + "learning_rate": 4.388892083885766e-05, + "loss": 3.0474, + "step": 1179500 + }, + { + "epoch": 0.36682018194902755, + "grad_norm": 6.268676280975342, + "learning_rate": 4.3886330300849544e-05, + "loss": 3.0298, + "step": 1180000 + }, + { + "epoch": 0.3669756142295144, + "grad_norm": 7.882168769836426, + "learning_rate": 4.388373976284143e-05, + "loss": 2.9901, + "step": 1180500 + }, + { + "epoch": 0.3671310465100013, + "grad_norm": 9.133962631225586, + "learning_rate": 4.388114922483331e-05, + "loss": 3.0322, + "step": 1181000 + }, + { + "epoch": 0.36728647879048815, + "grad_norm": 15.69717788696289, + "learning_rate": 4.38785586868252e-05, + "loss": 3.0177, + "step": 1181500 + }, + { + "epoch": 0.36744191107097507, + "grad_norm": 7.6475138664245605, + "learning_rate": 4.3875968148817086e-05, + "loss": 3.0606, + "step": 1182000 + }, + { + "epoch": 0.36759734335146194, + "grad_norm": 7.546743869781494, + "learning_rate": 4.3873377610808966e-05, + "loss": 3.0495, + "step": 1182500 + }, + { + "epoch": 0.3677527756319488, + "grad_norm": 8.86502456665039, + "learning_rate": 4.3870787072800853e-05, + "loss": 3.014, + "step": 1183000 + }, + { + "epoch": 0.36790820791243567, + "grad_norm": 8.725776672363281, + "learning_rate": 4.386819653479274e-05, + "loss": 3.093, + "step": 1183500 + }, + { + "epoch": 0.36806364019292254, + "grad_norm": 9.44382095336914, + "learning_rate": 4.386560599678463e-05, + "loss": 3.0507, + "step": 1184000 + }, + { + "epoch": 0.3682190724734094, + "grad_norm": 7.685892105102539, + "learning_rate": 4.3863015458776515e-05, + "loss": 3.0132, + "step": 1184500 + }, + { + "epoch": 0.3683745047538963, + "grad_norm": 7.183826446533203, + "learning_rate": 4.3860424920768395e-05, + "loss": 3.0558, + "step": 1185000 + }, + { + "epoch": 0.3685299370343832, + "grad_norm": 7.972708225250244, + "learning_rate": 4.385783438276028e-05, + "loss": 2.9741, + "step": 1185500 + }, + { + "epoch": 0.36868536931487006, + "grad_norm": 19.799562454223633, + "learning_rate": 4.385524384475217e-05, + "loss": 3.0344, + "step": 1186000 + }, + { + "epoch": 0.3688408015953569, + "grad_norm": 8.267668724060059, + "learning_rate": 4.385265330674405e-05, + "loss": 3.013, + "step": 1186500 + }, + { + "epoch": 0.3689962338758438, + "grad_norm": 7.9019389152526855, + "learning_rate": 4.385006276873594e-05, + "loss": 3.0201, + "step": 1187000 + }, + { + "epoch": 0.36915166615633066, + "grad_norm": 12.652122497558594, + "learning_rate": 4.3847472230727824e-05, + "loss": 3.0148, + "step": 1187500 + }, + { + "epoch": 0.3693070984368176, + "grad_norm": 5.3524041175842285, + "learning_rate": 4.384488169271971e-05, + "loss": 3.0207, + "step": 1188000 + }, + { + "epoch": 0.36946253071730445, + "grad_norm": 7.382601261138916, + "learning_rate": 4.38422911547116e-05, + "loss": 2.9894, + "step": 1188500 + }, + { + "epoch": 0.3696179629977913, + "grad_norm": 8.412457466125488, + "learning_rate": 4.3839700616703486e-05, + "loss": 2.9926, + "step": 1189000 + }, + { + "epoch": 0.3697733952782782, + "grad_norm": 9.104368209838867, + "learning_rate": 4.3837110078695366e-05, + "loss": 3.0052, + "step": 1189500 + }, + { + "epoch": 0.36992882755876505, + "grad_norm": 22.744033813476562, + "learning_rate": 4.383451954068725e-05, + "loss": 3.0417, + "step": 1190000 + }, + { + "epoch": 0.3700842598392519, + "grad_norm": 9.296847343444824, + "learning_rate": 4.3831929002679134e-05, + "loss": 3.0877, + "step": 1190500 + }, + { + "epoch": 0.37023969211973884, + "grad_norm": 7.019800186157227, + "learning_rate": 4.382933846467102e-05, + "loss": 3.0029, + "step": 1191000 + }, + { + "epoch": 0.3703951244002257, + "grad_norm": 6.683697700500488, + "learning_rate": 4.382674792666291e-05, + "loss": 3.0303, + "step": 1191500 + }, + { + "epoch": 0.3705505566807126, + "grad_norm": 7.539988994598389, + "learning_rate": 4.382415738865479e-05, + "loss": 2.9908, + "step": 1192000 + }, + { + "epoch": 0.37070598896119944, + "grad_norm": 7.909057140350342, + "learning_rate": 4.3821566850646675e-05, + "loss": 3.0587, + "step": 1192500 + }, + { + "epoch": 0.3708614212416863, + "grad_norm": 8.320595741271973, + "learning_rate": 4.381897631263856e-05, + "loss": 3.0479, + "step": 1193000 + }, + { + "epoch": 0.3710168535221732, + "grad_norm": 11.757220268249512, + "learning_rate": 4.381638577463045e-05, + "loss": 3.0128, + "step": 1193500 + }, + { + "epoch": 0.3711722858026601, + "grad_norm": 11.508705139160156, + "learning_rate": 4.381379523662234e-05, + "loss": 2.9975, + "step": 1194000 + }, + { + "epoch": 0.37132771808314696, + "grad_norm": 22.288097381591797, + "learning_rate": 4.3811204698614224e-05, + "loss": 3.0151, + "step": 1194500 + }, + { + "epoch": 0.37148315036363383, + "grad_norm": 18.388858795166016, + "learning_rate": 4.3808614160606104e-05, + "loss": 3.0395, + "step": 1195000 + }, + { + "epoch": 0.3716385826441207, + "grad_norm": 12.257899284362793, + "learning_rate": 4.380602362259799e-05, + "loss": 3.022, + "step": 1195500 + }, + { + "epoch": 0.37179401492460756, + "grad_norm": 8.94349479675293, + "learning_rate": 4.380343308458987e-05, + "loss": 2.9882, + "step": 1196000 + }, + { + "epoch": 0.37194944720509443, + "grad_norm": 7.69416618347168, + "learning_rate": 4.380084254658176e-05, + "loss": 3.0502, + "step": 1196500 + }, + { + "epoch": 0.3721048794855813, + "grad_norm": 9.843779563903809, + "learning_rate": 4.3798252008573646e-05, + "loss": 3.0254, + "step": 1197000 + }, + { + "epoch": 0.3722603117660682, + "grad_norm": 8.834566116333008, + "learning_rate": 4.379566147056553e-05, + "loss": 3.0289, + "step": 1197500 + }, + { + "epoch": 0.3724157440465551, + "grad_norm": 25.976566314697266, + "learning_rate": 4.379307093255742e-05, + "loss": 3.0119, + "step": 1198000 + }, + { + "epoch": 0.37257117632704195, + "grad_norm": 7.352529525756836, + "learning_rate": 4.379048039454931e-05, + "loss": 3.0574, + "step": 1198500 + }, + { + "epoch": 0.3727266086075288, + "grad_norm": 8.063488960266113, + "learning_rate": 4.378788985654119e-05, + "loss": 3.0518, + "step": 1199000 + }, + { + "epoch": 0.3728820408880157, + "grad_norm": 6.663394451141357, + "learning_rate": 4.3785299318533075e-05, + "loss": 3.0345, + "step": 1199500 + }, + { + "epoch": 0.37303747316850255, + "grad_norm": 8.111998558044434, + "learning_rate": 4.378270878052496e-05, + "loss": 3.003, + "step": 1200000 + }, + { + "epoch": 0.3731929054489895, + "grad_norm": 8.883503913879395, + "learning_rate": 4.378011824251684e-05, + "loss": 3.0192, + "step": 1200500 + }, + { + "epoch": 0.37334833772947634, + "grad_norm": 10.596620559692383, + "learning_rate": 4.377752770450873e-05, + "loss": 3.0437, + "step": 1201000 + }, + { + "epoch": 0.3735037700099632, + "grad_norm": 5.229915142059326, + "learning_rate": 4.377493716650062e-05, + "loss": 3.0083, + "step": 1201500 + }, + { + "epoch": 0.3736592022904501, + "grad_norm": 7.371228218078613, + "learning_rate": 4.37723466284925e-05, + "loss": 3.0123, + "step": 1202000 + }, + { + "epoch": 0.37381463457093694, + "grad_norm": 6.009052753448486, + "learning_rate": 4.3769756090484385e-05, + "loss": 2.9888, + "step": 1202500 + }, + { + "epoch": 0.3739700668514238, + "grad_norm": 7.801084995269775, + "learning_rate": 4.376716555247627e-05, + "loss": 3.026, + "step": 1203000 + }, + { + "epoch": 0.37412549913191073, + "grad_norm": 9.037374496459961, + "learning_rate": 4.376457501446816e-05, + "loss": 3.0467, + "step": 1203500 + }, + { + "epoch": 0.3742809314123976, + "grad_norm": 9.340840339660645, + "learning_rate": 4.3761984476460046e-05, + "loss": 3.0513, + "step": 1204000 + }, + { + "epoch": 0.37443636369288447, + "grad_norm": 7.152156352996826, + "learning_rate": 4.3759393938451926e-05, + "loss": 3.0503, + "step": 1204500 + }, + { + "epoch": 0.37459179597337133, + "grad_norm": 12.458781242370605, + "learning_rate": 4.3756803400443814e-05, + "loss": 3.0142, + "step": 1205000 + }, + { + "epoch": 0.3747472282538582, + "grad_norm": 6.6986188888549805, + "learning_rate": 4.37542128624357e-05, + "loss": 3.0005, + "step": 1205500 + }, + { + "epoch": 0.37490266053434507, + "grad_norm": 9.747681617736816, + "learning_rate": 4.375162232442758e-05, + "loss": 3.016, + "step": 1206000 + }, + { + "epoch": 0.375058092814832, + "grad_norm": 8.600428581237793, + "learning_rate": 4.374903178641947e-05, + "loss": 3.0188, + "step": 1206500 + }, + { + "epoch": 0.37521352509531886, + "grad_norm": 9.992462158203125, + "learning_rate": 4.3746441248411355e-05, + "loss": 3.0477, + "step": 1207000 + }, + { + "epoch": 0.3753689573758057, + "grad_norm": 6.418580532073975, + "learning_rate": 4.3743850710403236e-05, + "loss": 2.9656, + "step": 1207500 + }, + { + "epoch": 0.3755243896562926, + "grad_norm": 8.17013168334961, + "learning_rate": 4.374126017239513e-05, + "loss": 3.0043, + "step": 1208000 + }, + { + "epoch": 0.37567982193677946, + "grad_norm": 8.434198379516602, + "learning_rate": 4.373866963438701e-05, + "loss": 3.067, + "step": 1208500 + }, + { + "epoch": 0.3758352542172663, + "grad_norm": 7.775270462036133, + "learning_rate": 4.37360790963789e-05, + "loss": 3.0442, + "step": 1209000 + }, + { + "epoch": 0.37599068649775325, + "grad_norm": 7.154195308685303, + "learning_rate": 4.3733488558370784e-05, + "loss": 3.0037, + "step": 1209500 + }, + { + "epoch": 0.3761461187782401, + "grad_norm": 7.940889835357666, + "learning_rate": 4.3730898020362665e-05, + "loss": 3.004, + "step": 1210000 + }, + { + "epoch": 0.376301551058727, + "grad_norm": 10.056808471679688, + "learning_rate": 4.372830748235455e-05, + "loss": 3.037, + "step": 1210500 + }, + { + "epoch": 0.37645698333921385, + "grad_norm": 8.817445755004883, + "learning_rate": 4.372571694434644e-05, + "loss": 3.023, + "step": 1211000 + }, + { + "epoch": 0.3766124156197007, + "grad_norm": 6.8785810470581055, + "learning_rate": 4.372312640633832e-05, + "loss": 3.0263, + "step": 1211500 + }, + { + "epoch": 0.3767678479001876, + "grad_norm": 8.913065910339355, + "learning_rate": 4.372053586833021e-05, + "loss": 3.0123, + "step": 1212000 + }, + { + "epoch": 0.3769232801806745, + "grad_norm": 7.463315963745117, + "learning_rate": 4.3717945330322094e-05, + "loss": 3.0278, + "step": 1212500 + }, + { + "epoch": 0.37707871246116137, + "grad_norm": 8.209603309631348, + "learning_rate": 4.371535479231398e-05, + "loss": 2.9884, + "step": 1213000 + }, + { + "epoch": 0.37723414474164824, + "grad_norm": 12.07111930847168, + "learning_rate": 4.371276425430587e-05, + "loss": 3.0396, + "step": 1213500 + }, + { + "epoch": 0.3773895770221351, + "grad_norm": 9.527508735656738, + "learning_rate": 4.371017371629775e-05, + "loss": 3.0482, + "step": 1214000 + }, + { + "epoch": 0.37754500930262197, + "grad_norm": 10.062179565429688, + "learning_rate": 4.3707583178289636e-05, + "loss": 3.0286, + "step": 1214500 + }, + { + "epoch": 0.37770044158310884, + "grad_norm": 8.302620887756348, + "learning_rate": 4.370499264028152e-05, + "loss": 3.0215, + "step": 1215000 + }, + { + "epoch": 0.37785587386359576, + "grad_norm": 10.0821533203125, + "learning_rate": 4.37024021022734e-05, + "loss": 3.0467, + "step": 1215500 + }, + { + "epoch": 0.3780113061440826, + "grad_norm": 5.9731621742248535, + "learning_rate": 4.369981156426529e-05, + "loss": 3.0186, + "step": 1216000 + }, + { + "epoch": 0.3781667384245695, + "grad_norm": 10.062200546264648, + "learning_rate": 4.369722102625718e-05, + "loss": 2.9976, + "step": 1216500 + }, + { + "epoch": 0.37832217070505636, + "grad_norm": 8.01588249206543, + "learning_rate": 4.369463048824906e-05, + "loss": 3.0146, + "step": 1217000 + }, + { + "epoch": 0.3784776029855432, + "grad_norm": 22.307950973510742, + "learning_rate": 4.3692039950240945e-05, + "loss": 3.043, + "step": 1217500 + }, + { + "epoch": 0.3786330352660301, + "grad_norm": 5.837404251098633, + "learning_rate": 4.368944941223284e-05, + "loss": 3.025, + "step": 1218000 + }, + { + "epoch": 0.378788467546517, + "grad_norm": 12.44270133972168, + "learning_rate": 4.368685887422472e-05, + "loss": 3.004, + "step": 1218500 + }, + { + "epoch": 0.3789438998270039, + "grad_norm": 8.757488250732422, + "learning_rate": 4.3684268336216606e-05, + "loss": 2.9878, + "step": 1219000 + }, + { + "epoch": 0.37909933210749075, + "grad_norm": 6.830891132354736, + "learning_rate": 4.3681677798208494e-05, + "loss": 3.013, + "step": 1219500 + }, + { + "epoch": 0.3792547643879776, + "grad_norm": 8.742871284484863, + "learning_rate": 4.3679087260200374e-05, + "loss": 3.0104, + "step": 1220000 + }, + { + "epoch": 0.3794101966684645, + "grad_norm": 13.966339111328125, + "learning_rate": 4.367649672219226e-05, + "loss": 3.0266, + "step": 1220500 + }, + { + "epoch": 0.37956562894895135, + "grad_norm": 6.825784683227539, + "learning_rate": 4.367390618418414e-05, + "loss": 2.9763, + "step": 1221000 + }, + { + "epoch": 0.3797210612294383, + "grad_norm": 7.606245040893555, + "learning_rate": 4.367131564617603e-05, + "loss": 3.0388, + "step": 1221500 + }, + { + "epoch": 0.37987649350992514, + "grad_norm": 10.624388694763184, + "learning_rate": 4.3668725108167916e-05, + "loss": 3.0754, + "step": 1222000 + }, + { + "epoch": 0.380031925790412, + "grad_norm": 9.104411125183105, + "learning_rate": 4.36661345701598e-05, + "loss": 3.027, + "step": 1222500 + }, + { + "epoch": 0.3801873580708989, + "grad_norm": 8.698908805847168, + "learning_rate": 4.366354403215169e-05, + "loss": 3.0465, + "step": 1223000 + }, + { + "epoch": 0.38034279035138574, + "grad_norm": 6.419732093811035, + "learning_rate": 4.366095349414358e-05, + "loss": 3.0461, + "step": 1223500 + }, + { + "epoch": 0.3804982226318726, + "grad_norm": 8.975924491882324, + "learning_rate": 4.365836295613546e-05, + "loss": 2.9905, + "step": 1224000 + }, + { + "epoch": 0.38065365491235953, + "grad_norm": 7.855982780456543, + "learning_rate": 4.3655772418127345e-05, + "loss": 3.0235, + "step": 1224500 + }, + { + "epoch": 0.3808090871928464, + "grad_norm": 6.778043270111084, + "learning_rate": 4.365318188011923e-05, + "loss": 3.0317, + "step": 1225000 + }, + { + "epoch": 0.38096451947333326, + "grad_norm": 8.120940208435059, + "learning_rate": 4.365059134211111e-05, + "loss": 3.0148, + "step": 1225500 + }, + { + "epoch": 0.38111995175382013, + "grad_norm": 8.0519437789917, + "learning_rate": 4.3648000804103e-05, + "loss": 3.0301, + "step": 1226000 + }, + { + "epoch": 0.381275384034307, + "grad_norm": 11.034692764282227, + "learning_rate": 4.364541026609488e-05, + "loss": 2.9932, + "step": 1226500 + }, + { + "epoch": 0.38143081631479386, + "grad_norm": 10.132370948791504, + "learning_rate": 4.364281972808677e-05, + "loss": 3.0315, + "step": 1227000 + }, + { + "epoch": 0.3815862485952808, + "grad_norm": 9.784975051879883, + "learning_rate": 4.3640229190078654e-05, + "loss": 3.0265, + "step": 1227500 + }, + { + "epoch": 0.38174168087576765, + "grad_norm": 7.636826038360596, + "learning_rate": 4.363763865207054e-05, + "loss": 3.0393, + "step": 1228000 + }, + { + "epoch": 0.3818971131562545, + "grad_norm": 6.201311111450195, + "learning_rate": 4.363504811406243e-05, + "loss": 3.0118, + "step": 1228500 + }, + { + "epoch": 0.3820525454367414, + "grad_norm": 10.57809829711914, + "learning_rate": 4.3632457576054316e-05, + "loss": 2.961, + "step": 1229000 + }, + { + "epoch": 0.38220797771722825, + "grad_norm": 8.159257888793945, + "learning_rate": 4.3629867038046196e-05, + "loss": 2.9975, + "step": 1229500 + }, + { + "epoch": 0.3823634099977151, + "grad_norm": 7.5762434005737305, + "learning_rate": 4.362727650003808e-05, + "loss": 2.9902, + "step": 1230000 + }, + { + "epoch": 0.38251884227820204, + "grad_norm": 8.109339714050293, + "learning_rate": 4.362468596202997e-05, + "loss": 2.9806, + "step": 1230500 + }, + { + "epoch": 0.3826742745586889, + "grad_norm": 7.088109016418457, + "learning_rate": 4.362209542402185e-05, + "loss": 2.992, + "step": 1231000 + }, + { + "epoch": 0.3828297068391758, + "grad_norm": 7.802616119384766, + "learning_rate": 4.361950488601374e-05, + "loss": 3.0582, + "step": 1231500 + }, + { + "epoch": 0.38298513911966264, + "grad_norm": 8.055475234985352, + "learning_rate": 4.3616914348005625e-05, + "loss": 3.0129, + "step": 1232000 + }, + { + "epoch": 0.3831405714001495, + "grad_norm": 9.491019248962402, + "learning_rate": 4.361432380999751e-05, + "loss": 3.0133, + "step": 1232500 + }, + { + "epoch": 0.3832960036806364, + "grad_norm": 8.761237144470215, + "learning_rate": 4.36117332719894e-05, + "loss": 2.9815, + "step": 1233000 + }, + { + "epoch": 0.3834514359611233, + "grad_norm": 7.236588478088379, + "learning_rate": 4.360914273398128e-05, + "loss": 3.0522, + "step": 1233500 + }, + { + "epoch": 0.38360686824161017, + "grad_norm": 9.874594688415527, + "learning_rate": 4.360655219597317e-05, + "loss": 3.0369, + "step": 1234000 + }, + { + "epoch": 0.38376230052209703, + "grad_norm": 8.02299690246582, + "learning_rate": 4.3603961657965054e-05, + "loss": 2.9995, + "step": 1234500 + }, + { + "epoch": 0.3839177328025839, + "grad_norm": 9.731297492980957, + "learning_rate": 4.3601371119956934e-05, + "loss": 3.0342, + "step": 1235000 + }, + { + "epoch": 0.38407316508307077, + "grad_norm": 10.01852798461914, + "learning_rate": 4.359878058194882e-05, + "loss": 2.9921, + "step": 1235500 + }, + { + "epoch": 0.38422859736355763, + "grad_norm": 12.51931095123291, + "learning_rate": 4.359619004394071e-05, + "loss": 3.0216, + "step": 1236000 + }, + { + "epoch": 0.38438402964404456, + "grad_norm": 9.30201530456543, + "learning_rate": 4.359359950593259e-05, + "loss": 3.0365, + "step": 1236500 + }, + { + "epoch": 0.3845394619245314, + "grad_norm": 9.188941955566406, + "learning_rate": 4.3591008967924476e-05, + "loss": 3.0261, + "step": 1237000 + }, + { + "epoch": 0.3846948942050183, + "grad_norm": 13.325172424316406, + "learning_rate": 4.3588418429916363e-05, + "loss": 3.0151, + "step": 1237500 + }, + { + "epoch": 0.38485032648550516, + "grad_norm": 7.687941551208496, + "learning_rate": 4.358582789190825e-05, + "loss": 3.0564, + "step": 1238000 + }, + { + "epoch": 0.385005758765992, + "grad_norm": 9.397013664245605, + "learning_rate": 4.358323735390014e-05, + "loss": 3.0346, + "step": 1238500 + }, + { + "epoch": 0.3851611910464789, + "grad_norm": 7.54604434967041, + "learning_rate": 4.358064681589202e-05, + "loss": 3.0284, + "step": 1239000 + }, + { + "epoch": 0.3853166233269658, + "grad_norm": 6.863761901855469, + "learning_rate": 4.3578056277883905e-05, + "loss": 3.0378, + "step": 1239500 + }, + { + "epoch": 0.3854720556074527, + "grad_norm": 7.309162139892578, + "learning_rate": 4.357546573987579e-05, + "loss": 2.9952, + "step": 1240000 + }, + { + "epoch": 0.38562748788793955, + "grad_norm": 9.858902931213379, + "learning_rate": 4.357287520186767e-05, + "loss": 2.9971, + "step": 1240500 + }, + { + "epoch": 0.3857829201684264, + "grad_norm": 7.073546409606934, + "learning_rate": 4.357028466385956e-05, + "loss": 3.0076, + "step": 1241000 + }, + { + "epoch": 0.3859383524489133, + "grad_norm": 14.491243362426758, + "learning_rate": 4.356769412585145e-05, + "loss": 2.9809, + "step": 1241500 + }, + { + "epoch": 0.38609378472940015, + "grad_norm": 9.943365097045898, + "learning_rate": 4.3565103587843334e-05, + "loss": 3.0425, + "step": 1242000 + }, + { + "epoch": 0.38624921700988707, + "grad_norm": 5.878431797027588, + "learning_rate": 4.356251304983522e-05, + "loss": 3.0159, + "step": 1242500 + }, + { + "epoch": 0.38640464929037394, + "grad_norm": 8.112102508544922, + "learning_rate": 4.355992251182711e-05, + "loss": 3.014, + "step": 1243000 + }, + { + "epoch": 0.3865600815708608, + "grad_norm": 7.135739803314209, + "learning_rate": 4.355733197381899e-05, + "loss": 2.9736, + "step": 1243500 + }, + { + "epoch": 0.38671551385134767, + "grad_norm": 8.073102951049805, + "learning_rate": 4.3554741435810876e-05, + "loss": 3.0381, + "step": 1244000 + }, + { + "epoch": 0.38687094613183454, + "grad_norm": 12.645852088928223, + "learning_rate": 4.3552150897802756e-05, + "loss": 3.0188, + "step": 1244500 + }, + { + "epoch": 0.3870263784123214, + "grad_norm": 13.620457649230957, + "learning_rate": 4.3549560359794644e-05, + "loss": 3.0764, + "step": 1245000 + }, + { + "epoch": 0.3871818106928083, + "grad_norm": 9.10978889465332, + "learning_rate": 4.354696982178653e-05, + "loss": 3.0613, + "step": 1245500 + }, + { + "epoch": 0.3873372429732952, + "grad_norm": 4.441054344177246, + "learning_rate": 4.354437928377841e-05, + "loss": 3.0173, + "step": 1246000 + }, + { + "epoch": 0.38749267525378206, + "grad_norm": 7.1512908935546875, + "learning_rate": 4.35417887457703e-05, + "loss": 3.0287, + "step": 1246500 + }, + { + "epoch": 0.3876481075342689, + "grad_norm": 17.318614959716797, + "learning_rate": 4.3539198207762185e-05, + "loss": 3.0567, + "step": 1247000 + }, + { + "epoch": 0.3878035398147558, + "grad_norm": 7.4618706703186035, + "learning_rate": 4.353660766975407e-05, + "loss": 3.0339, + "step": 1247500 + }, + { + "epoch": 0.38795897209524266, + "grad_norm": 7.812060356140137, + "learning_rate": 4.353401713174596e-05, + "loss": 3.0182, + "step": 1248000 + }, + { + "epoch": 0.3881144043757296, + "grad_norm": 10.958283424377441, + "learning_rate": 4.353142659373785e-05, + "loss": 3.0148, + "step": 1248500 + }, + { + "epoch": 0.38826983665621645, + "grad_norm": 9.654011726379395, + "learning_rate": 4.352883605572973e-05, + "loss": 3.0152, + "step": 1249000 + }, + { + "epoch": 0.3884252689367033, + "grad_norm": 7.413597583770752, + "learning_rate": 4.3526245517721614e-05, + "loss": 3.0403, + "step": 1249500 + }, + { + "epoch": 0.3885807012171902, + "grad_norm": 8.649049758911133, + "learning_rate": 4.3523654979713495e-05, + "loss": 3.0303, + "step": 1250000 + }, + { + "epoch": 0.38873613349767705, + "grad_norm": 8.885530471801758, + "learning_rate": 4.352106444170538e-05, + "loss": 3.0342, + "step": 1250500 + }, + { + "epoch": 0.3888915657781639, + "grad_norm": 7.870588302612305, + "learning_rate": 4.351847390369727e-05, + "loss": 3.0471, + "step": 1251000 + }, + { + "epoch": 0.38904699805865084, + "grad_norm": 11.418512344360352, + "learning_rate": 4.3515883365689156e-05, + "loss": 3.0116, + "step": 1251500 + }, + { + "epoch": 0.3892024303391377, + "grad_norm": 53.59665298461914, + "learning_rate": 4.3513292827681043e-05, + "loss": 3.0324, + "step": 1252000 + }, + { + "epoch": 0.3893578626196246, + "grad_norm": 12.31152629852295, + "learning_rate": 4.351070228967293e-05, + "loss": 3.0222, + "step": 1252500 + }, + { + "epoch": 0.38951329490011144, + "grad_norm": 9.492338180541992, + "learning_rate": 4.350811175166481e-05, + "loss": 2.9708, + "step": 1253000 + }, + { + "epoch": 0.3896687271805983, + "grad_norm": 8.200867652893066, + "learning_rate": 4.35055212136567e-05, + "loss": 3.0171, + "step": 1253500 + }, + { + "epoch": 0.3898241594610852, + "grad_norm": 7.996339797973633, + "learning_rate": 4.3502930675648585e-05, + "loss": 2.9936, + "step": 1254000 + }, + { + "epoch": 0.3899795917415721, + "grad_norm": 9.30465030670166, + "learning_rate": 4.3500340137640466e-05, + "loss": 3.0584, + "step": 1254500 + }, + { + "epoch": 0.39013502402205896, + "grad_norm": 9.862898826599121, + "learning_rate": 4.349774959963235e-05, + "loss": 3.04, + "step": 1255000 + }, + { + "epoch": 0.39029045630254583, + "grad_norm": 6.557104110717773, + "learning_rate": 4.349515906162424e-05, + "loss": 3.0267, + "step": 1255500 + }, + { + "epoch": 0.3904458885830327, + "grad_norm": 8.016644477844238, + "learning_rate": 4.349256852361612e-05, + "loss": 3.0008, + "step": 1256000 + }, + { + "epoch": 0.39060132086351956, + "grad_norm": 9.572198867797852, + "learning_rate": 4.348997798560801e-05, + "loss": 3.0124, + "step": 1256500 + }, + { + "epoch": 0.39075675314400643, + "grad_norm": 10.095785140991211, + "learning_rate": 4.3487387447599895e-05, + "loss": 3.0533, + "step": 1257000 + }, + { + "epoch": 0.39091218542449335, + "grad_norm": 6.802495956420898, + "learning_rate": 4.348479690959178e-05, + "loss": 2.9639, + "step": 1257500 + }, + { + "epoch": 0.3910676177049802, + "grad_norm": 7.813571929931641, + "learning_rate": 4.348220637158367e-05, + "loss": 3.0422, + "step": 1258000 + }, + { + "epoch": 0.3912230499854671, + "grad_norm": 7.344969749450684, + "learning_rate": 4.347961583357555e-05, + "loss": 3.0735, + "step": 1258500 + }, + { + "epoch": 0.39137848226595395, + "grad_norm": 9.820842742919922, + "learning_rate": 4.3477025295567436e-05, + "loss": 3.0313, + "step": 1259000 + }, + { + "epoch": 0.3915339145464408, + "grad_norm": 9.195771217346191, + "learning_rate": 4.3474434757559324e-05, + "loss": 3.0125, + "step": 1259500 + }, + { + "epoch": 0.3916893468269277, + "grad_norm": 15.951432228088379, + "learning_rate": 4.3471844219551204e-05, + "loss": 3.0935, + "step": 1260000 + }, + { + "epoch": 0.3918447791074146, + "grad_norm": 8.687067031860352, + "learning_rate": 4.346925368154309e-05, + "loss": 2.9937, + "step": 1260500 + }, + { + "epoch": 0.3920002113879015, + "grad_norm": 7.413652420043945, + "learning_rate": 4.346666314353498e-05, + "loss": 3.0406, + "step": 1261000 + }, + { + "epoch": 0.39215564366838834, + "grad_norm": 7.547786712646484, + "learning_rate": 4.3464072605526865e-05, + "loss": 3.0135, + "step": 1261500 + }, + { + "epoch": 0.3923110759488752, + "grad_norm": 12.598599433898926, + "learning_rate": 4.346148206751875e-05, + "loss": 3.0129, + "step": 1262000 + }, + { + "epoch": 0.3924665082293621, + "grad_norm": 24.424209594726562, + "learning_rate": 4.345889152951063e-05, + "loss": 3.0733, + "step": 1262500 + }, + { + "epoch": 0.39262194050984894, + "grad_norm": 13.987735748291016, + "learning_rate": 4.345630099150252e-05, + "loss": 3.0207, + "step": 1263000 + }, + { + "epoch": 0.39277737279033587, + "grad_norm": 8.84871768951416, + "learning_rate": 4.345371045349441e-05, + "loss": 3.0183, + "step": 1263500 + }, + { + "epoch": 0.39293280507082273, + "grad_norm": 10.42096996307373, + "learning_rate": 4.345111991548629e-05, + "loss": 3.0111, + "step": 1264000 + }, + { + "epoch": 0.3930882373513096, + "grad_norm": 13.863884925842285, + "learning_rate": 4.3448529377478175e-05, + "loss": 3.0285, + "step": 1264500 + }, + { + "epoch": 0.39324366963179647, + "grad_norm": 13.663488388061523, + "learning_rate": 4.344593883947006e-05, + "loss": 3.022, + "step": 1265000 + }, + { + "epoch": 0.39339910191228333, + "grad_norm": 6.962210655212402, + "learning_rate": 4.344334830146194e-05, + "loss": 3.0141, + "step": 1265500 + }, + { + "epoch": 0.3935545341927702, + "grad_norm": 9.218218803405762, + "learning_rate": 4.344075776345383e-05, + "loss": 3.0662, + "step": 1266000 + }, + { + "epoch": 0.3937099664732571, + "grad_norm": 10.032159805297852, + "learning_rate": 4.343816722544572e-05, + "loss": 3.0358, + "step": 1266500 + }, + { + "epoch": 0.393865398753744, + "grad_norm": 6.8020710945129395, + "learning_rate": 4.3435576687437604e-05, + "loss": 3.0063, + "step": 1267000 + }, + { + "epoch": 0.39402083103423086, + "grad_norm": 7.425950050354004, + "learning_rate": 4.343298614942949e-05, + "loss": 2.9989, + "step": 1267500 + }, + { + "epoch": 0.3941762633147177, + "grad_norm": 8.164745330810547, + "learning_rate": 4.343039561142137e-05, + "loss": 2.9838, + "step": 1268000 + }, + { + "epoch": 0.3943316955952046, + "grad_norm": 10.36411190032959, + "learning_rate": 4.342780507341326e-05, + "loss": 2.9872, + "step": 1268500 + }, + { + "epoch": 0.39448712787569146, + "grad_norm": 10.527634620666504, + "learning_rate": 4.3425214535405146e-05, + "loss": 2.9947, + "step": 1269000 + }, + { + "epoch": 0.3946425601561784, + "grad_norm": 8.832425117492676, + "learning_rate": 4.3422623997397026e-05, + "loss": 3.0094, + "step": 1269500 + }, + { + "epoch": 0.39479799243666525, + "grad_norm": 8.921676635742188, + "learning_rate": 4.342003345938891e-05, + "loss": 3.018, + "step": 1270000 + }, + { + "epoch": 0.3949534247171521, + "grad_norm": 13.54818058013916, + "learning_rate": 4.34174429213808e-05, + "loss": 3.0544, + "step": 1270500 + }, + { + "epoch": 0.395108856997639, + "grad_norm": 8.154711723327637, + "learning_rate": 4.341485238337268e-05, + "loss": 3.0297, + "step": 1271000 + }, + { + "epoch": 0.39526428927812585, + "grad_norm": 8.13044261932373, + "learning_rate": 4.3412261845364575e-05, + "loss": 3.034, + "step": 1271500 + }, + { + "epoch": 0.3954197215586127, + "grad_norm": 7.555085182189941, + "learning_rate": 4.340967130735646e-05, + "loss": 3.0211, + "step": 1272000 + }, + { + "epoch": 0.39557515383909964, + "grad_norm": 7.310537815093994, + "learning_rate": 4.340708076934834e-05, + "loss": 2.9657, + "step": 1272500 + }, + { + "epoch": 0.3957305861195865, + "grad_norm": 6.106410026550293, + "learning_rate": 4.340449023134023e-05, + "loss": 3.0022, + "step": 1273000 + }, + { + "epoch": 0.39588601840007337, + "grad_norm": 8.500679016113281, + "learning_rate": 4.3401899693332116e-05, + "loss": 3.0171, + "step": 1273500 + }, + { + "epoch": 0.39604145068056024, + "grad_norm": 9.423917770385742, + "learning_rate": 4.3399309155324e-05, + "loss": 3.0923, + "step": 1274000 + }, + { + "epoch": 0.3961968829610471, + "grad_norm": 8.30810546875, + "learning_rate": 4.3396718617315884e-05, + "loss": 2.9549, + "step": 1274500 + }, + { + "epoch": 0.39635231524153397, + "grad_norm": 6.542405128479004, + "learning_rate": 4.3394128079307764e-05, + "loss": 3.0174, + "step": 1275000 + }, + { + "epoch": 0.3965077475220209, + "grad_norm": 8.50153923034668, + "learning_rate": 4.339153754129965e-05, + "loss": 3.042, + "step": 1275500 + }, + { + "epoch": 0.39666317980250776, + "grad_norm": 11.382680892944336, + "learning_rate": 4.338894700329154e-05, + "loss": 3.054, + "step": 1276000 + }, + { + "epoch": 0.3968186120829946, + "grad_norm": 13.635973930358887, + "learning_rate": 4.3386356465283426e-05, + "loss": 3.0023, + "step": 1276500 + }, + { + "epoch": 0.3969740443634815, + "grad_norm": 7.909348487854004, + "learning_rate": 4.338376592727531e-05, + "loss": 3.0065, + "step": 1277000 + }, + { + "epoch": 0.39712947664396836, + "grad_norm": 9.509486198425293, + "learning_rate": 4.33811753892672e-05, + "loss": 3.0552, + "step": 1277500 + }, + { + "epoch": 0.3972849089244552, + "grad_norm": 10.230456352233887, + "learning_rate": 4.337858485125908e-05, + "loss": 3.0, + "step": 1278000 + }, + { + "epoch": 0.39744034120494215, + "grad_norm": 7.553265571594238, + "learning_rate": 4.337599431325097e-05, + "loss": 3.0147, + "step": 1278500 + }, + { + "epoch": 0.397595773485429, + "grad_norm": 6.851576805114746, + "learning_rate": 4.3373403775242855e-05, + "loss": 3.0357, + "step": 1279000 + }, + { + "epoch": 0.3977512057659159, + "grad_norm": 7.840222358703613, + "learning_rate": 4.3370813237234735e-05, + "loss": 3.0395, + "step": 1279500 + }, + { + "epoch": 0.39790663804640275, + "grad_norm": 7.6064276695251465, + "learning_rate": 4.336822269922662e-05, + "loss": 3.0384, + "step": 1280000 + }, + { + "epoch": 0.3980620703268896, + "grad_norm": 12.899372100830078, + "learning_rate": 4.33656321612185e-05, + "loss": 3.0325, + "step": 1280500 + }, + { + "epoch": 0.3982175026073765, + "grad_norm": 9.645275115966797, + "learning_rate": 4.336304162321039e-05, + "loss": 2.9883, + "step": 1281000 + }, + { + "epoch": 0.3983729348878634, + "grad_norm": 7.107562065124512, + "learning_rate": 4.3360451085202284e-05, + "loss": 2.9977, + "step": 1281500 + }, + { + "epoch": 0.3985283671683503, + "grad_norm": 9.308304786682129, + "learning_rate": 4.3357860547194164e-05, + "loss": 3.0179, + "step": 1282000 + }, + { + "epoch": 0.39868379944883714, + "grad_norm": 9.396379470825195, + "learning_rate": 4.335527000918605e-05, + "loss": 3.052, + "step": 1282500 + }, + { + "epoch": 0.398839231729324, + "grad_norm": 8.345808982849121, + "learning_rate": 4.335267947117794e-05, + "loss": 3.0098, + "step": 1283000 + }, + { + "epoch": 0.3989946640098109, + "grad_norm": 8.945322036743164, + "learning_rate": 4.335008893316982e-05, + "loss": 3.0406, + "step": 1283500 + }, + { + "epoch": 0.39915009629029774, + "grad_norm": 19.422496795654297, + "learning_rate": 4.3347498395161706e-05, + "loss": 3.0608, + "step": 1284000 + }, + { + "epoch": 0.39930552857078466, + "grad_norm": 9.693379402160645, + "learning_rate": 4.334490785715359e-05, + "loss": 3.0331, + "step": 1284500 + }, + { + "epoch": 0.39946096085127153, + "grad_norm": 8.000107765197754, + "learning_rate": 4.3342317319145474e-05, + "loss": 3.0682, + "step": 1285000 + }, + { + "epoch": 0.3996163931317584, + "grad_norm": 13.810440063476562, + "learning_rate": 4.333972678113736e-05, + "loss": 2.9831, + "step": 1285500 + }, + { + "epoch": 0.39977182541224526, + "grad_norm": 8.180213928222656, + "learning_rate": 4.333713624312925e-05, + "loss": 3.025, + "step": 1286000 + }, + { + "epoch": 0.39992725769273213, + "grad_norm": 7.4429707527160645, + "learning_rate": 4.3334545705121135e-05, + "loss": 2.9849, + "step": 1286500 + }, + { + "epoch": 0.400082689973219, + "grad_norm": 9.063563346862793, + "learning_rate": 4.333195516711302e-05, + "loss": 3.0482, + "step": 1287000 + }, + { + "epoch": 0.4002381222537059, + "grad_norm": 7.4437971115112305, + "learning_rate": 4.33293646291049e-05, + "loss": 3.0174, + "step": 1287500 + }, + { + "epoch": 0.4003935545341928, + "grad_norm": 7.867112636566162, + "learning_rate": 4.332677409109679e-05, + "loss": 3.0255, + "step": 1288000 + }, + { + "epoch": 0.40054898681467965, + "grad_norm": 24.995323181152344, + "learning_rate": 4.332418355308868e-05, + "loss": 3.0542, + "step": 1288500 + }, + { + "epoch": 0.4007044190951665, + "grad_norm": 8.687703132629395, + "learning_rate": 4.332159301508056e-05, + "loss": 2.9878, + "step": 1289000 + }, + { + "epoch": 0.4008598513756534, + "grad_norm": 20.882640838623047, + "learning_rate": 4.3319002477072444e-05, + "loss": 3.0466, + "step": 1289500 + }, + { + "epoch": 0.40101528365614025, + "grad_norm": 12.433582305908203, + "learning_rate": 4.331641193906433e-05, + "loss": 2.9766, + "step": 1290000 + }, + { + "epoch": 0.4011707159366272, + "grad_norm": 8.964000701904297, + "learning_rate": 4.331382140105621e-05, + "loss": 2.9988, + "step": 1290500 + }, + { + "epoch": 0.40132614821711404, + "grad_norm": 9.981168746948242, + "learning_rate": 4.33112308630481e-05, + "loss": 2.9738, + "step": 1291000 + }, + { + "epoch": 0.4014815804976009, + "grad_norm": 9.554330825805664, + "learning_rate": 4.330864032503999e-05, + "loss": 3.029, + "step": 1291500 + }, + { + "epoch": 0.4016370127780878, + "grad_norm": 13.589818000793457, + "learning_rate": 4.330604978703187e-05, + "loss": 3.0203, + "step": 1292000 + }, + { + "epoch": 0.40179244505857464, + "grad_norm": 11.202045440673828, + "learning_rate": 4.330345924902376e-05, + "loss": 3.0225, + "step": 1292500 + }, + { + "epoch": 0.4019478773390615, + "grad_norm": 15.775449752807617, + "learning_rate": 4.330086871101564e-05, + "loss": 3.0307, + "step": 1293000 + }, + { + "epoch": 0.40210330961954843, + "grad_norm": 9.299273490905762, + "learning_rate": 4.329827817300753e-05, + "loss": 3.0331, + "step": 1293500 + }, + { + "epoch": 0.4022587419000353, + "grad_norm": 33.04179382324219, + "learning_rate": 4.3295687634999415e-05, + "loss": 3.0036, + "step": 1294000 + }, + { + "epoch": 0.40241417418052217, + "grad_norm": 8.37044906616211, + "learning_rate": 4.3293097096991296e-05, + "loss": 3.0276, + "step": 1294500 + }, + { + "epoch": 0.40256960646100903, + "grad_norm": 12.284272193908691, + "learning_rate": 4.329050655898318e-05, + "loss": 3.0342, + "step": 1295000 + }, + { + "epoch": 0.4027250387414959, + "grad_norm": 8.589241027832031, + "learning_rate": 4.328791602097507e-05, + "loss": 3.0432, + "step": 1295500 + }, + { + "epoch": 0.40288047102198277, + "grad_norm": 14.318597793579102, + "learning_rate": 4.328532548296696e-05, + "loss": 2.9799, + "step": 1296000 + }, + { + "epoch": 0.4030359033024697, + "grad_norm": 7.192989826202393, + "learning_rate": 4.3282734944958844e-05, + "loss": 3.0363, + "step": 1296500 + }, + { + "epoch": 0.40319133558295656, + "grad_norm": 8.09339714050293, + "learning_rate": 4.328014440695073e-05, + "loss": 3.0128, + "step": 1297000 + }, + { + "epoch": 0.4033467678634434, + "grad_norm": 8.879638671875, + "learning_rate": 4.327755386894261e-05, + "loss": 3.0131, + "step": 1297500 + }, + { + "epoch": 0.4035022001439303, + "grad_norm": 8.99708366394043, + "learning_rate": 4.32749633309345e-05, + "loss": 2.9716, + "step": 1298000 + }, + { + "epoch": 0.40365763242441716, + "grad_norm": 10.59968090057373, + "learning_rate": 4.327237279292638e-05, + "loss": 2.9891, + "step": 1298500 + }, + { + "epoch": 0.403813064704904, + "grad_norm": 6.54097318649292, + "learning_rate": 4.3269782254918266e-05, + "loss": 3.0305, + "step": 1299000 + }, + { + "epoch": 0.40396849698539095, + "grad_norm": 9.583012580871582, + "learning_rate": 4.3267191716910154e-05, + "loss": 3.0229, + "step": 1299500 + }, + { + "epoch": 0.4041239292658778, + "grad_norm": 32.133995056152344, + "learning_rate": 4.3264601178902034e-05, + "loss": 3.018, + "step": 1300000 + }, + { + "epoch": 0.4042793615463647, + "grad_norm": 6.880998134613037, + "learning_rate": 4.326201064089392e-05, + "loss": 2.9978, + "step": 1300500 + }, + { + "epoch": 0.40443479382685155, + "grad_norm": 6.730745792388916, + "learning_rate": 4.325942010288581e-05, + "loss": 3.0023, + "step": 1301000 + }, + { + "epoch": 0.4045902261073384, + "grad_norm": 12.036593437194824, + "learning_rate": 4.3256829564877695e-05, + "loss": 3.0289, + "step": 1301500 + }, + { + "epoch": 0.4047456583878253, + "grad_norm": 13.934772491455078, + "learning_rate": 4.325423902686958e-05, + "loss": 3.0249, + "step": 1302000 + }, + { + "epoch": 0.4049010906683122, + "grad_norm": 19.64508628845215, + "learning_rate": 4.325164848886147e-05, + "loss": 3.0367, + "step": 1302500 + }, + { + "epoch": 0.40505652294879907, + "grad_norm": 8.978446960449219, + "learning_rate": 4.324905795085335e-05, + "loss": 3.0372, + "step": 1303000 + }, + { + "epoch": 0.40521195522928594, + "grad_norm": 7.464851379394531, + "learning_rate": 4.324646741284524e-05, + "loss": 2.9944, + "step": 1303500 + }, + { + "epoch": 0.4053673875097728, + "grad_norm": 8.566202163696289, + "learning_rate": 4.324387687483712e-05, + "loss": 2.9746, + "step": 1304000 + }, + { + "epoch": 0.40552281979025967, + "grad_norm": 9.78466796875, + "learning_rate": 4.3241286336829005e-05, + "loss": 3.0303, + "step": 1304500 + }, + { + "epoch": 0.40567825207074654, + "grad_norm": 9.025242805480957, + "learning_rate": 4.323869579882089e-05, + "loss": 3.0414, + "step": 1305000 + }, + { + "epoch": 0.40583368435123346, + "grad_norm": 6.625077247619629, + "learning_rate": 4.323610526081278e-05, + "loss": 3.0208, + "step": 1305500 + }, + { + "epoch": 0.4059891166317203, + "grad_norm": 8.772872924804688, + "learning_rate": 4.3233514722804666e-05, + "loss": 2.9777, + "step": 1306000 + }, + { + "epoch": 0.4061445489122072, + "grad_norm": 9.322205543518066, + "learning_rate": 4.323092418479655e-05, + "loss": 2.9681, + "step": 1306500 + }, + { + "epoch": 0.40629998119269406, + "grad_norm": 7.773362159729004, + "learning_rate": 4.3228333646788434e-05, + "loss": 3.0097, + "step": 1307000 + }, + { + "epoch": 0.4064554134731809, + "grad_norm": 8.884862899780273, + "learning_rate": 4.322574310878032e-05, + "loss": 3.0059, + "step": 1307500 + }, + { + "epoch": 0.4066108457536678, + "grad_norm": 18.609895706176758, + "learning_rate": 4.322315257077221e-05, + "loss": 3.0354, + "step": 1308000 + }, + { + "epoch": 0.4067662780341547, + "grad_norm": 8.135658264160156, + "learning_rate": 4.322056203276409e-05, + "loss": 2.9763, + "step": 1308500 + }, + { + "epoch": 0.4069217103146416, + "grad_norm": 8.290506362915039, + "learning_rate": 4.3217971494755976e-05, + "loss": 3.0113, + "step": 1309000 + }, + { + "epoch": 0.40707714259512845, + "grad_norm": 8.608513832092285, + "learning_rate": 4.321538095674786e-05, + "loss": 3.0145, + "step": 1309500 + }, + { + "epoch": 0.4072325748756153, + "grad_norm": 7.497950553894043, + "learning_rate": 4.321279041873974e-05, + "loss": 3.0466, + "step": 1310000 + }, + { + "epoch": 0.4073880071561022, + "grad_norm": 7.92622184753418, + "learning_rate": 4.321019988073163e-05, + "loss": 3.0606, + "step": 1310500 + }, + { + "epoch": 0.40754343943658905, + "grad_norm": 9.604890823364258, + "learning_rate": 4.320760934272352e-05, + "loss": 2.9989, + "step": 1311000 + }, + { + "epoch": 0.407698871717076, + "grad_norm": 8.266107559204102, + "learning_rate": 4.3205018804715405e-05, + "loss": 2.9899, + "step": 1311500 + }, + { + "epoch": 0.40785430399756284, + "grad_norm": 7.851831436157227, + "learning_rate": 4.320242826670729e-05, + "loss": 3.0075, + "step": 1312000 + }, + { + "epoch": 0.4080097362780497, + "grad_norm": 6.458501815795898, + "learning_rate": 4.319983772869917e-05, + "loss": 2.9971, + "step": 1312500 + }, + { + "epoch": 0.4081651685585366, + "grad_norm": 11.517350196838379, + "learning_rate": 4.319724719069106e-05, + "loss": 3.0239, + "step": 1313000 + }, + { + "epoch": 0.40832060083902344, + "grad_norm": 8.433989524841309, + "learning_rate": 4.3194656652682946e-05, + "loss": 3.044, + "step": 1313500 + }, + { + "epoch": 0.4084760331195103, + "grad_norm": 6.999762058258057, + "learning_rate": 4.319206611467483e-05, + "loss": 2.973, + "step": 1314000 + }, + { + "epoch": 0.40863146539999723, + "grad_norm": 8.324854850769043, + "learning_rate": 4.3189475576666714e-05, + "loss": 2.9738, + "step": 1314500 + }, + { + "epoch": 0.4087868976804841, + "grad_norm": 8.180917739868164, + "learning_rate": 4.31868850386586e-05, + "loss": 2.9668, + "step": 1315000 + }, + { + "epoch": 0.40894232996097096, + "grad_norm": 8.991897583007812, + "learning_rate": 4.318429450065049e-05, + "loss": 2.9917, + "step": 1315500 + }, + { + "epoch": 0.40909776224145783, + "grad_norm": 10.214797973632812, + "learning_rate": 4.3181703962642375e-05, + "loss": 3.0482, + "step": 1316000 + }, + { + "epoch": 0.4092531945219447, + "grad_norm": 6.879065036773682, + "learning_rate": 4.3179113424634256e-05, + "loss": 3.0144, + "step": 1316500 + }, + { + "epoch": 0.40940862680243156, + "grad_norm": 7.250428676605225, + "learning_rate": 4.317652288662614e-05, + "loss": 3.012, + "step": 1317000 + }, + { + "epoch": 0.4095640590829185, + "grad_norm": 10.291481018066406, + "learning_rate": 4.317393234861803e-05, + "loss": 2.9994, + "step": 1317500 + }, + { + "epoch": 0.40971949136340535, + "grad_norm": 9.289000511169434, + "learning_rate": 4.317134181060991e-05, + "loss": 3.0257, + "step": 1318000 + }, + { + "epoch": 0.4098749236438922, + "grad_norm": 9.058664321899414, + "learning_rate": 4.31687512726018e-05, + "loss": 3.0172, + "step": 1318500 + }, + { + "epoch": 0.4100303559243791, + "grad_norm": 7.720111846923828, + "learning_rate": 4.3166160734593685e-05, + "loss": 3.0294, + "step": 1319000 + }, + { + "epoch": 0.41018578820486595, + "grad_norm": 12.397781372070312, + "learning_rate": 4.3163570196585565e-05, + "loss": 3.0242, + "step": 1319500 + }, + { + "epoch": 0.4103412204853528, + "grad_norm": 15.056527137756348, + "learning_rate": 4.316097965857745e-05, + "loss": 3.0443, + "step": 1320000 + }, + { + "epoch": 0.41049665276583974, + "grad_norm": 17.083972930908203, + "learning_rate": 4.315838912056934e-05, + "loss": 3.0352, + "step": 1320500 + }, + { + "epoch": 0.4106520850463266, + "grad_norm": 17.020593643188477, + "learning_rate": 4.315579858256123e-05, + "loss": 3.0612, + "step": 1321000 + }, + { + "epoch": 0.4108075173268135, + "grad_norm": 8.826165199279785, + "learning_rate": 4.3153208044553114e-05, + "loss": 3.0267, + "step": 1321500 + }, + { + "epoch": 0.41096294960730034, + "grad_norm": 7.392735004425049, + "learning_rate": 4.3150617506544994e-05, + "loss": 2.9921, + "step": 1322000 + }, + { + "epoch": 0.4111183818877872, + "grad_norm": 10.466955184936523, + "learning_rate": 4.314802696853688e-05, + "loss": 3.0154, + "step": 1322500 + }, + { + "epoch": 0.4112738141682741, + "grad_norm": 7.447009086608887, + "learning_rate": 4.314543643052877e-05, + "loss": 3.0517, + "step": 1323000 + }, + { + "epoch": 0.411429246448761, + "grad_norm": 7.935750484466553, + "learning_rate": 4.314284589252065e-05, + "loss": 2.9997, + "step": 1323500 + }, + { + "epoch": 0.41158467872924787, + "grad_norm": 8.44989013671875, + "learning_rate": 4.3140255354512536e-05, + "loss": 3.0031, + "step": 1324000 + }, + { + "epoch": 0.41174011100973473, + "grad_norm": 8.243706703186035, + "learning_rate": 4.313766481650442e-05, + "loss": 2.9883, + "step": 1324500 + }, + { + "epoch": 0.4118955432902216, + "grad_norm": 9.83686637878418, + "learning_rate": 4.3135074278496304e-05, + "loss": 3.0121, + "step": 1325000 + }, + { + "epoch": 0.41205097557070847, + "grad_norm": 8.759627342224121, + "learning_rate": 4.31324837404882e-05, + "loss": 2.9786, + "step": 1325500 + }, + { + "epoch": 0.41220640785119533, + "grad_norm": 38.831172943115234, + "learning_rate": 4.3129893202480085e-05, + "loss": 3.0366, + "step": 1326000 + }, + { + "epoch": 0.41236184013168226, + "grad_norm": 11.366860389709473, + "learning_rate": 4.3127302664471965e-05, + "loss": 3.0232, + "step": 1326500 + }, + { + "epoch": 0.4125172724121691, + "grad_norm": 8.837693214416504, + "learning_rate": 4.312471212646385e-05, + "loss": 3.0772, + "step": 1327000 + }, + { + "epoch": 0.412672704692656, + "grad_norm": 6.364861488342285, + "learning_rate": 4.312212158845574e-05, + "loss": 3.0489, + "step": 1327500 + }, + { + "epoch": 0.41282813697314286, + "grad_norm": 9.078208923339844, + "learning_rate": 4.311953105044762e-05, + "loss": 2.9545, + "step": 1328000 + }, + { + "epoch": 0.4129835692536297, + "grad_norm": 21.424070358276367, + "learning_rate": 4.311694051243951e-05, + "loss": 2.9786, + "step": 1328500 + }, + { + "epoch": 0.4131390015341166, + "grad_norm": 9.242812156677246, + "learning_rate": 4.311434997443139e-05, + "loss": 2.9755, + "step": 1329000 + }, + { + "epoch": 0.41329443381460346, + "grad_norm": 8.9983549118042, + "learning_rate": 4.3111759436423274e-05, + "loss": 3.0476, + "step": 1329500 + }, + { + "epoch": 0.4134498660950904, + "grad_norm": 9.033163070678711, + "learning_rate": 4.310916889841516e-05, + "loss": 3.0284, + "step": 1330000 + }, + { + "epoch": 0.41360529837557725, + "grad_norm": 8.145248413085938, + "learning_rate": 4.310657836040705e-05, + "loss": 3.0031, + "step": 1330500 + }, + { + "epoch": 0.4137607306560641, + "grad_norm": 8.62743091583252, + "learning_rate": 4.3103987822398936e-05, + "loss": 3.0051, + "step": 1331000 + }, + { + "epoch": 0.413916162936551, + "grad_norm": 10.58740234375, + "learning_rate": 4.310139728439082e-05, + "loss": 2.9896, + "step": 1331500 + }, + { + "epoch": 0.41407159521703785, + "grad_norm": 7.564646244049072, + "learning_rate": 4.30988067463827e-05, + "loss": 3.005, + "step": 1332000 + }, + { + "epoch": 0.4142270274975247, + "grad_norm": 6.183964729309082, + "learning_rate": 4.309621620837459e-05, + "loss": 2.9574, + "step": 1332500 + }, + { + "epoch": 0.41438245977801164, + "grad_norm": 7.565702438354492, + "learning_rate": 4.309362567036648e-05, + "loss": 3.0163, + "step": 1333000 + }, + { + "epoch": 0.4145378920584985, + "grad_norm": 14.001727104187012, + "learning_rate": 4.309103513235836e-05, + "loss": 3.008, + "step": 1333500 + }, + { + "epoch": 0.41469332433898537, + "grad_norm": 7.881333351135254, + "learning_rate": 4.3088444594350245e-05, + "loss": 3.041, + "step": 1334000 + }, + { + "epoch": 0.41484875661947224, + "grad_norm": 9.861733436584473, + "learning_rate": 4.3085854056342126e-05, + "loss": 3.0422, + "step": 1334500 + }, + { + "epoch": 0.4150041888999591, + "grad_norm": 8.360870361328125, + "learning_rate": 4.308326351833401e-05, + "loss": 3.0434, + "step": 1335000 + }, + { + "epoch": 0.41515962118044597, + "grad_norm": 9.898003578186035, + "learning_rate": 4.308067298032591e-05, + "loss": 2.9849, + "step": 1335500 + }, + { + "epoch": 0.4153150534609329, + "grad_norm": 7.412974834442139, + "learning_rate": 4.307808244231779e-05, + "loss": 3.0447, + "step": 1336000 + }, + { + "epoch": 0.41547048574141976, + "grad_norm": 7.954694747924805, + "learning_rate": 4.3075491904309674e-05, + "loss": 2.9914, + "step": 1336500 + }, + { + "epoch": 0.4156259180219066, + "grad_norm": 9.346441268920898, + "learning_rate": 4.307290136630156e-05, + "loss": 3.0486, + "step": 1337000 + }, + { + "epoch": 0.4157813503023935, + "grad_norm": 7.850369453430176, + "learning_rate": 4.307031082829344e-05, + "loss": 2.9665, + "step": 1337500 + }, + { + "epoch": 0.41593678258288036, + "grad_norm": 9.547650337219238, + "learning_rate": 4.306772029028533e-05, + "loss": 3.0275, + "step": 1338000 + }, + { + "epoch": 0.4160922148633672, + "grad_norm": 8.75617790222168, + "learning_rate": 4.3065129752277216e-05, + "loss": 3.0464, + "step": 1338500 + }, + { + "epoch": 0.41624764714385415, + "grad_norm": 7.503572463989258, + "learning_rate": 4.3062539214269096e-05, + "loss": 3.0738, + "step": 1339000 + }, + { + "epoch": 0.416403079424341, + "grad_norm": 10.265230178833008, + "learning_rate": 4.3059948676260984e-05, + "loss": 3.0232, + "step": 1339500 + }, + { + "epoch": 0.4165585117048279, + "grad_norm": 7.608583450317383, + "learning_rate": 4.305735813825287e-05, + "loss": 2.9784, + "step": 1340000 + }, + { + "epoch": 0.41671394398531475, + "grad_norm": 12.094138145446777, + "learning_rate": 4.305476760024476e-05, + "loss": 3.0302, + "step": 1340500 + }, + { + "epoch": 0.4168693762658016, + "grad_norm": 6.539437770843506, + "learning_rate": 4.3052177062236645e-05, + "loss": 3.0357, + "step": 1341000 + }, + { + "epoch": 0.4170248085462885, + "grad_norm": 6.6922383308410645, + "learning_rate": 4.3049586524228525e-05, + "loss": 2.9546, + "step": 1341500 + }, + { + "epoch": 0.4171802408267754, + "grad_norm": 6.452007293701172, + "learning_rate": 4.304699598622041e-05, + "loss": 3.0445, + "step": 1342000 + }, + { + "epoch": 0.4173356731072623, + "grad_norm": 6.8176350593566895, + "learning_rate": 4.30444054482123e-05, + "loss": 3.0005, + "step": 1342500 + }, + { + "epoch": 0.41749110538774914, + "grad_norm": 9.36044692993164, + "learning_rate": 4.304181491020418e-05, + "loss": 3.0575, + "step": 1343000 + }, + { + "epoch": 0.417646537668236, + "grad_norm": 6.92308235168457, + "learning_rate": 4.303922437219607e-05, + "loss": 3.0097, + "step": 1343500 + }, + { + "epoch": 0.4178019699487229, + "grad_norm": 8.705224990844727, + "learning_rate": 4.3036633834187954e-05, + "loss": 3.0798, + "step": 1344000 + }, + { + "epoch": 0.41795740222920974, + "grad_norm": 7.686152935028076, + "learning_rate": 4.3034043296179835e-05, + "loss": 3.0055, + "step": 1344500 + }, + { + "epoch": 0.41811283450969666, + "grad_norm": 7.5312724113464355, + "learning_rate": 4.303145275817172e-05, + "loss": 3.0347, + "step": 1345000 + }, + { + "epoch": 0.41826826679018353, + "grad_norm": 8.21970272064209, + "learning_rate": 4.3028862220163616e-05, + "loss": 3.0076, + "step": 1345500 + }, + { + "epoch": 0.4184236990706704, + "grad_norm": 6.135211944580078, + "learning_rate": 4.3026271682155496e-05, + "loss": 3.0144, + "step": 1346000 + }, + { + "epoch": 0.41857913135115726, + "grad_norm": 9.076484680175781, + "learning_rate": 4.302368114414738e-05, + "loss": 3.0146, + "step": 1346500 + }, + { + "epoch": 0.41873456363164413, + "grad_norm": 7.678025245666504, + "learning_rate": 4.3021090606139264e-05, + "loss": 3.0127, + "step": 1347000 + }, + { + "epoch": 0.418889995912131, + "grad_norm": 8.007664680480957, + "learning_rate": 4.301850006813115e-05, + "loss": 3.0212, + "step": 1347500 + }, + { + "epoch": 0.4190454281926179, + "grad_norm": 10.046514511108398, + "learning_rate": 4.301590953012304e-05, + "loss": 2.9774, + "step": 1348000 + }, + { + "epoch": 0.4192008604731048, + "grad_norm": 6.875695705413818, + "learning_rate": 4.301331899211492e-05, + "loss": 2.9921, + "step": 1348500 + }, + { + "epoch": 0.41935629275359165, + "grad_norm": 11.046422958374023, + "learning_rate": 4.3010728454106806e-05, + "loss": 2.9814, + "step": 1349000 + }, + { + "epoch": 0.4195117250340785, + "grad_norm": 14.297409057617188, + "learning_rate": 4.300813791609869e-05, + "loss": 3.0457, + "step": 1349500 + }, + { + "epoch": 0.4196671573145654, + "grad_norm": 7.750283241271973, + "learning_rate": 4.300554737809058e-05, + "loss": 3.0604, + "step": 1350000 + }, + { + "epoch": 0.41982258959505225, + "grad_norm": 22.405672073364258, + "learning_rate": 4.300295684008247e-05, + "loss": 2.9731, + "step": 1350500 + }, + { + "epoch": 0.4199780218755392, + "grad_norm": 11.25070858001709, + "learning_rate": 4.3000366302074354e-05, + "loss": 2.9464, + "step": 1351000 + }, + { + "epoch": 0.42013345415602604, + "grad_norm": 6.305685043334961, + "learning_rate": 4.2997775764066235e-05, + "loss": 2.9994, + "step": 1351500 + }, + { + "epoch": 0.4202888864365129, + "grad_norm": 8.216525077819824, + "learning_rate": 4.299518522605812e-05, + "loss": 3.0079, + "step": 1352000 + }, + { + "epoch": 0.4204443187169998, + "grad_norm": 9.095291137695312, + "learning_rate": 4.299259468805e-05, + "loss": 3.0273, + "step": 1352500 + }, + { + "epoch": 0.42059975099748664, + "grad_norm": 9.922999382019043, + "learning_rate": 4.299000415004189e-05, + "loss": 2.9916, + "step": 1353000 + }, + { + "epoch": 0.4207551832779735, + "grad_norm": 9.658455848693848, + "learning_rate": 4.2987413612033776e-05, + "loss": 3.0084, + "step": 1353500 + }, + { + "epoch": 0.42091061555846043, + "grad_norm": 6.451661109924316, + "learning_rate": 4.298482307402566e-05, + "loss": 3.0086, + "step": 1354000 + }, + { + "epoch": 0.4210660478389473, + "grad_norm": 6.02064847946167, + "learning_rate": 4.2982232536017544e-05, + "loss": 3.0112, + "step": 1354500 + }, + { + "epoch": 0.42122148011943417, + "grad_norm": 9.586270332336426, + "learning_rate": 4.297964199800944e-05, + "loss": 2.9704, + "step": 1355000 + }, + { + "epoch": 0.42137691239992103, + "grad_norm": 8.84138011932373, + "learning_rate": 4.297705146000132e-05, + "loss": 3.0477, + "step": 1355500 + }, + { + "epoch": 0.4215323446804079, + "grad_norm": 9.96894359588623, + "learning_rate": 4.2974460921993205e-05, + "loss": 3.0005, + "step": 1356000 + }, + { + "epoch": 0.42168777696089477, + "grad_norm": 8.990548133850098, + "learning_rate": 4.297187038398509e-05, + "loss": 3.0469, + "step": 1356500 + }, + { + "epoch": 0.4218432092413817, + "grad_norm": 9.246652603149414, + "learning_rate": 4.296927984597697e-05, + "loss": 2.9931, + "step": 1357000 + }, + { + "epoch": 0.42199864152186856, + "grad_norm": 25.8072452545166, + "learning_rate": 4.296668930796886e-05, + "loss": 3.001, + "step": 1357500 + }, + { + "epoch": 0.4221540738023554, + "grad_norm": 6.915383815765381, + "learning_rate": 4.296409876996075e-05, + "loss": 2.9516, + "step": 1358000 + }, + { + "epoch": 0.4223095060828423, + "grad_norm": 7.596576690673828, + "learning_rate": 4.296150823195263e-05, + "loss": 3.014, + "step": 1358500 + }, + { + "epoch": 0.42246493836332916, + "grad_norm": 9.589164733886719, + "learning_rate": 4.2958917693944515e-05, + "loss": 3.0047, + "step": 1359000 + }, + { + "epoch": 0.422620370643816, + "grad_norm": 8.873188972473145, + "learning_rate": 4.29563271559364e-05, + "loss": 2.9781, + "step": 1359500 + }, + { + "epoch": 0.42277580292430295, + "grad_norm": 8.152970314025879, + "learning_rate": 4.295373661792829e-05, + "loss": 2.9823, + "step": 1360000 + }, + { + "epoch": 0.4229312352047898, + "grad_norm": 11.409231185913086, + "learning_rate": 4.2951146079920176e-05, + "loss": 3.02, + "step": 1360500 + }, + { + "epoch": 0.4230866674852767, + "grad_norm": 8.223034858703613, + "learning_rate": 4.2948555541912057e-05, + "loss": 2.9853, + "step": 1361000 + }, + { + "epoch": 0.42324209976576355, + "grad_norm": 8.958937644958496, + "learning_rate": 4.2945965003903944e-05, + "loss": 2.9966, + "step": 1361500 + }, + { + "epoch": 0.4233975320462504, + "grad_norm": 10.350997924804688, + "learning_rate": 4.294337446589583e-05, + "loss": 2.9816, + "step": 1362000 + }, + { + "epoch": 0.4235529643267373, + "grad_norm": 7.938509464263916, + "learning_rate": 4.294078392788771e-05, + "loss": 3.0153, + "step": 1362500 + }, + { + "epoch": 0.4237083966072242, + "grad_norm": 8.24617862701416, + "learning_rate": 4.29381933898796e-05, + "loss": 3.0034, + "step": 1363000 + }, + { + "epoch": 0.42386382888771107, + "grad_norm": 8.371079444885254, + "learning_rate": 4.2935602851871486e-05, + "loss": 3.0592, + "step": 1363500 + }, + { + "epoch": 0.42401926116819794, + "grad_norm": 9.348672866821289, + "learning_rate": 4.2933012313863366e-05, + "loss": 2.9995, + "step": 1364000 + }, + { + "epoch": 0.4241746934486848, + "grad_norm": 8.985843658447266, + "learning_rate": 4.293042177585525e-05, + "loss": 3.012, + "step": 1364500 + }, + { + "epoch": 0.42433012572917167, + "grad_norm": 18.982446670532227, + "learning_rate": 4.292783123784714e-05, + "loss": 3.0323, + "step": 1365000 + }, + { + "epoch": 0.42448555800965854, + "grad_norm": 14.388622283935547, + "learning_rate": 4.292524069983903e-05, + "loss": 3.0094, + "step": 1365500 + }, + { + "epoch": 0.42464099029014546, + "grad_norm": 10.625821113586426, + "learning_rate": 4.2922650161830915e-05, + "loss": 2.9862, + "step": 1366000 + }, + { + "epoch": 0.4247964225706323, + "grad_norm": 7.396295070648193, + "learning_rate": 4.2920059623822795e-05, + "loss": 2.9954, + "step": 1366500 + }, + { + "epoch": 0.4249518548511192, + "grad_norm": 9.365406036376953, + "learning_rate": 4.291746908581468e-05, + "loss": 3.0182, + "step": 1367000 + }, + { + "epoch": 0.42510728713160606, + "grad_norm": 8.177726745605469, + "learning_rate": 4.291487854780657e-05, + "loss": 3.0117, + "step": 1367500 + }, + { + "epoch": 0.4252627194120929, + "grad_norm": 11.528486251831055, + "learning_rate": 4.291228800979845e-05, + "loss": 3.0724, + "step": 1368000 + }, + { + "epoch": 0.4254181516925798, + "grad_norm": 8.801376342773438, + "learning_rate": 4.290969747179034e-05, + "loss": 3.033, + "step": 1368500 + }, + { + "epoch": 0.4255735839730667, + "grad_norm": 7.477367877960205, + "learning_rate": 4.2907106933782224e-05, + "loss": 3.0017, + "step": 1369000 + }, + { + "epoch": 0.4257290162535536, + "grad_norm": 10.253073692321777, + "learning_rate": 4.290451639577411e-05, + "loss": 3.0388, + "step": 1369500 + }, + { + "epoch": 0.42588444853404045, + "grad_norm": 11.690999984741211, + "learning_rate": 4.2901925857766e-05, + "loss": 3.0025, + "step": 1370000 + }, + { + "epoch": 0.4260398808145273, + "grad_norm": 11.415396690368652, + "learning_rate": 4.289933531975788e-05, + "loss": 3.0318, + "step": 1370500 + }, + { + "epoch": 0.4261953130950142, + "grad_norm": 5.311564922332764, + "learning_rate": 4.2896744781749766e-05, + "loss": 2.9931, + "step": 1371000 + }, + { + "epoch": 0.42635074537550105, + "grad_norm": 9.723118782043457, + "learning_rate": 4.289415424374165e-05, + "loss": 3.0261, + "step": 1371500 + }, + { + "epoch": 0.426506177655988, + "grad_norm": 6.77277135848999, + "learning_rate": 4.289156370573353e-05, + "loss": 3.0358, + "step": 1372000 + }, + { + "epoch": 0.42666160993647484, + "grad_norm": 11.935309410095215, + "learning_rate": 4.288897316772542e-05, + "loss": 3.0645, + "step": 1372500 + }, + { + "epoch": 0.4268170422169617, + "grad_norm": 8.348891258239746, + "learning_rate": 4.288638262971731e-05, + "loss": 2.9984, + "step": 1373000 + }, + { + "epoch": 0.4269724744974486, + "grad_norm": 10.535664558410645, + "learning_rate": 4.288379209170919e-05, + "loss": 3.0189, + "step": 1373500 + }, + { + "epoch": 0.42712790677793544, + "grad_norm": 12.704721450805664, + "learning_rate": 4.2881201553701075e-05, + "loss": 2.9923, + "step": 1374000 + }, + { + "epoch": 0.4272833390584223, + "grad_norm": 13.306448936462402, + "learning_rate": 4.287861101569296e-05, + "loss": 3.0594, + "step": 1374500 + }, + { + "epoch": 0.42743877133890923, + "grad_norm": 7.691682815551758, + "learning_rate": 4.287602047768485e-05, + "loss": 2.983, + "step": 1375000 + }, + { + "epoch": 0.4275942036193961, + "grad_norm": 9.345656394958496, + "learning_rate": 4.2873429939676737e-05, + "loss": 3.0166, + "step": 1375500 + }, + { + "epoch": 0.42774963589988296, + "grad_norm": 7.950403213500977, + "learning_rate": 4.2870839401668624e-05, + "loss": 3.0235, + "step": 1376000 + }, + { + "epoch": 0.42790506818036983, + "grad_norm": 6.963176727294922, + "learning_rate": 4.2868248863660504e-05, + "loss": 3.01, + "step": 1376500 + }, + { + "epoch": 0.4280605004608567, + "grad_norm": 8.23904800415039, + "learning_rate": 4.286565832565239e-05, + "loss": 2.991, + "step": 1377000 + }, + { + "epoch": 0.42821593274134356, + "grad_norm": 12.555656433105469, + "learning_rate": 4.286306778764427e-05, + "loss": 2.9854, + "step": 1377500 + }, + { + "epoch": 0.4283713650218305, + "grad_norm": 8.686471939086914, + "learning_rate": 4.286047724963616e-05, + "loss": 3.0223, + "step": 1378000 + }, + { + "epoch": 0.42852679730231735, + "grad_norm": 27.87056541442871, + "learning_rate": 4.2857886711628046e-05, + "loss": 3.0043, + "step": 1378500 + }, + { + "epoch": 0.4286822295828042, + "grad_norm": 8.523122787475586, + "learning_rate": 4.285529617361993e-05, + "loss": 3.0309, + "step": 1379000 + }, + { + "epoch": 0.4288376618632911, + "grad_norm": 8.052837371826172, + "learning_rate": 4.285270563561182e-05, + "loss": 3.0137, + "step": 1379500 + }, + { + "epoch": 0.42899309414377795, + "grad_norm": 9.328721046447754, + "learning_rate": 4.285011509760371e-05, + "loss": 3.0343, + "step": 1380000 + }, + { + "epoch": 0.4291485264242648, + "grad_norm": 10.354205131530762, + "learning_rate": 4.284752455959559e-05, + "loss": 3.0219, + "step": 1380500 + }, + { + "epoch": 0.42930395870475174, + "grad_norm": 9.881058692932129, + "learning_rate": 4.2844934021587475e-05, + "loss": 3.013, + "step": 1381000 + }, + { + "epoch": 0.4294593909852386, + "grad_norm": 16.144311904907227, + "learning_rate": 4.284234348357936e-05, + "loss": 3.0006, + "step": 1381500 + }, + { + "epoch": 0.4296148232657255, + "grad_norm": 7.619064807891846, + "learning_rate": 4.283975294557124e-05, + "loss": 2.9732, + "step": 1382000 + }, + { + "epoch": 0.42977025554621234, + "grad_norm": 8.249123573303223, + "learning_rate": 4.283716240756313e-05, + "loss": 3.0048, + "step": 1382500 + }, + { + "epoch": 0.4299256878266992, + "grad_norm": 21.568374633789062, + "learning_rate": 4.283457186955501e-05, + "loss": 3.0064, + "step": 1383000 + }, + { + "epoch": 0.4300811201071861, + "grad_norm": 7.264678478240967, + "learning_rate": 4.28319813315469e-05, + "loss": 3.0341, + "step": 1383500 + }, + { + "epoch": 0.430236552387673, + "grad_norm": 23.936904907226562, + "learning_rate": 4.2829390793538784e-05, + "loss": 3.0078, + "step": 1384000 + }, + { + "epoch": 0.43039198466815987, + "grad_norm": 8.439824104309082, + "learning_rate": 4.282680025553067e-05, + "loss": 3.0141, + "step": 1384500 + }, + { + "epoch": 0.43054741694864673, + "grad_norm": 8.365412712097168, + "learning_rate": 4.282420971752256e-05, + "loss": 2.9675, + "step": 1385000 + }, + { + "epoch": 0.4307028492291336, + "grad_norm": 12.95134449005127, + "learning_rate": 4.2821619179514446e-05, + "loss": 3.0217, + "step": 1385500 + }, + { + "epoch": 0.43085828150962047, + "grad_norm": 7.897010803222656, + "learning_rate": 4.2819028641506326e-05, + "loss": 2.9895, + "step": 1386000 + }, + { + "epoch": 0.43101371379010733, + "grad_norm": 9.497601509094238, + "learning_rate": 4.281643810349821e-05, + "loss": 3.0425, + "step": 1386500 + }, + { + "epoch": 0.43116914607059426, + "grad_norm": 41.611873626708984, + "learning_rate": 4.28138475654901e-05, + "loss": 3.0062, + "step": 1387000 + }, + { + "epoch": 0.4313245783510811, + "grad_norm": 11.129328727722168, + "learning_rate": 4.281125702748198e-05, + "loss": 3.0456, + "step": 1387500 + }, + { + "epoch": 0.431480010631568, + "grad_norm": 23.944120407104492, + "learning_rate": 4.280866648947387e-05, + "loss": 3.0358, + "step": 1388000 + }, + { + "epoch": 0.43163544291205486, + "grad_norm": 10.175239562988281, + "learning_rate": 4.280607595146575e-05, + "loss": 2.9677, + "step": 1388500 + }, + { + "epoch": 0.4317908751925417, + "grad_norm": 10.823430061340332, + "learning_rate": 4.280348541345764e-05, + "loss": 2.9888, + "step": 1389000 + }, + { + "epoch": 0.4319463074730286, + "grad_norm": 10.304635047912598, + "learning_rate": 4.280089487544953e-05, + "loss": 3.0106, + "step": 1389500 + }, + { + "epoch": 0.4321017397535155, + "grad_norm": 13.40346908569336, + "learning_rate": 4.279830433744141e-05, + "loss": 2.9875, + "step": 1390000 + }, + { + "epoch": 0.4322571720340024, + "grad_norm": 8.574790000915527, + "learning_rate": 4.27957137994333e-05, + "loss": 2.9691, + "step": 1390500 + }, + { + "epoch": 0.43241260431448925, + "grad_norm": 8.791261672973633, + "learning_rate": 4.2793123261425184e-05, + "loss": 3.058, + "step": 1391000 + }, + { + "epoch": 0.4325680365949761, + "grad_norm": 49.65424346923828, + "learning_rate": 4.2790532723417065e-05, + "loss": 3.0365, + "step": 1391500 + }, + { + "epoch": 0.432723468875463, + "grad_norm": 8.80201244354248, + "learning_rate": 4.278794218540895e-05, + "loss": 2.9855, + "step": 1392000 + }, + { + "epoch": 0.43287890115594985, + "grad_norm": 7.172414302825928, + "learning_rate": 4.278535164740084e-05, + "loss": 2.9901, + "step": 1392500 + }, + { + "epoch": 0.43303433343643677, + "grad_norm": 6.93745231628418, + "learning_rate": 4.278276110939272e-05, + "loss": 3.0233, + "step": 1393000 + }, + { + "epoch": 0.43318976571692364, + "grad_norm": 6.93948221206665, + "learning_rate": 4.2780170571384606e-05, + "loss": 3.018, + "step": 1393500 + }, + { + "epoch": 0.4333451979974105, + "grad_norm": 9.40650463104248, + "learning_rate": 4.2777580033376494e-05, + "loss": 2.9943, + "step": 1394000 + }, + { + "epoch": 0.43350063027789737, + "grad_norm": 9.720420837402344, + "learning_rate": 4.277498949536838e-05, + "loss": 2.9612, + "step": 1394500 + }, + { + "epoch": 0.43365606255838424, + "grad_norm": 9.036855697631836, + "learning_rate": 4.277239895736027e-05, + "loss": 2.9775, + "step": 1395000 + }, + { + "epoch": 0.4338114948388711, + "grad_norm": 17.271392822265625, + "learning_rate": 4.276980841935215e-05, + "loss": 3.0258, + "step": 1395500 + }, + { + "epoch": 0.433966927119358, + "grad_norm": 5.007867813110352, + "learning_rate": 4.2767217881344035e-05, + "loss": 3.0319, + "step": 1396000 + }, + { + "epoch": 0.4341223593998449, + "grad_norm": 14.442608833312988, + "learning_rate": 4.276462734333592e-05, + "loss": 3.0067, + "step": 1396500 + }, + { + "epoch": 0.43427779168033176, + "grad_norm": 8.106505393981934, + "learning_rate": 4.27620368053278e-05, + "loss": 3.0351, + "step": 1397000 + }, + { + "epoch": 0.4344332239608186, + "grad_norm": 7.17493200302124, + "learning_rate": 4.275944626731969e-05, + "loss": 3.0291, + "step": 1397500 + }, + { + "epoch": 0.4345886562413055, + "grad_norm": 7.476723670959473, + "learning_rate": 4.275685572931158e-05, + "loss": 2.9975, + "step": 1398000 + }, + { + "epoch": 0.43474408852179236, + "grad_norm": 6.630337238311768, + "learning_rate": 4.275426519130346e-05, + "loss": 3.0103, + "step": 1398500 + }, + { + "epoch": 0.4348995208022793, + "grad_norm": 7.442617416381836, + "learning_rate": 4.275167465329535e-05, + "loss": 3.0043, + "step": 1399000 + }, + { + "epoch": 0.43505495308276615, + "grad_norm": 9.103610038757324, + "learning_rate": 4.274908411528724e-05, + "loss": 3.0385, + "step": 1399500 + }, + { + "epoch": 0.435210385363253, + "grad_norm": 6.813909530639648, + "learning_rate": 4.274649357727912e-05, + "loss": 3.0033, + "step": 1400000 + }, + { + "epoch": 0.4353658176437399, + "grad_norm": 8.304441452026367, + "learning_rate": 4.2743903039271006e-05, + "loss": 3.0058, + "step": 1400500 + }, + { + "epoch": 0.43552124992422675, + "grad_norm": 7.535092830657959, + "learning_rate": 4.2741312501262887e-05, + "loss": 3.0552, + "step": 1401000 + }, + { + "epoch": 0.4356766822047136, + "grad_norm": 9.222039222717285, + "learning_rate": 4.2738721963254774e-05, + "loss": 2.9965, + "step": 1401500 + }, + { + "epoch": 0.43583211448520054, + "grad_norm": 8.259675025939941, + "learning_rate": 4.273613142524666e-05, + "loss": 3.0327, + "step": 1402000 + }, + { + "epoch": 0.4359875467656874, + "grad_norm": 9.526379585266113, + "learning_rate": 4.273354088723854e-05, + "loss": 3.0674, + "step": 1402500 + }, + { + "epoch": 0.4361429790461743, + "grad_norm": 10.605490684509277, + "learning_rate": 4.273095034923043e-05, + "loss": 2.9852, + "step": 1403000 + }, + { + "epoch": 0.43629841132666114, + "grad_norm": 6.377732753753662, + "learning_rate": 4.2728359811222316e-05, + "loss": 3.0243, + "step": 1403500 + }, + { + "epoch": 0.436453843607148, + "grad_norm": 8.191668510437012, + "learning_rate": 4.27257692732142e-05, + "loss": 2.9926, + "step": 1404000 + }, + { + "epoch": 0.4366092758876349, + "grad_norm": 7.817259311676025, + "learning_rate": 4.272317873520609e-05, + "loss": 3.0334, + "step": 1404500 + }, + { + "epoch": 0.4367647081681218, + "grad_norm": 8.507429122924805, + "learning_rate": 4.272058819719798e-05, + "loss": 3.0279, + "step": 1405000 + }, + { + "epoch": 0.43692014044860866, + "grad_norm": 8.747611999511719, + "learning_rate": 4.271799765918986e-05, + "loss": 2.9937, + "step": 1405500 + }, + { + "epoch": 0.43707557272909553, + "grad_norm": 7.000222682952881, + "learning_rate": 4.2715407121181745e-05, + "loss": 3.008, + "step": 1406000 + }, + { + "epoch": 0.4372310050095824, + "grad_norm": 17.81680679321289, + "learning_rate": 4.2712816583173625e-05, + "loss": 2.9918, + "step": 1406500 + }, + { + "epoch": 0.43738643729006926, + "grad_norm": 8.49725341796875, + "learning_rate": 4.271022604516551e-05, + "loss": 3.0131, + "step": 1407000 + }, + { + "epoch": 0.43754186957055613, + "grad_norm": 8.878352165222168, + "learning_rate": 4.27076355071574e-05, + "loss": 3.0203, + "step": 1407500 + }, + { + "epoch": 0.43769730185104305, + "grad_norm": 8.239829063415527, + "learning_rate": 4.270504496914928e-05, + "loss": 3.0236, + "step": 1408000 + }, + { + "epoch": 0.4378527341315299, + "grad_norm": 10.48766803741455, + "learning_rate": 4.270245443114117e-05, + "loss": 2.9821, + "step": 1408500 + }, + { + "epoch": 0.4380081664120168, + "grad_norm": 7.844772815704346, + "learning_rate": 4.269986389313306e-05, + "loss": 2.9688, + "step": 1409000 + }, + { + "epoch": 0.43816359869250365, + "grad_norm": 13.657275199890137, + "learning_rate": 4.269727335512494e-05, + "loss": 3.0055, + "step": 1409500 + }, + { + "epoch": 0.4383190309729905, + "grad_norm": 7.971479892730713, + "learning_rate": 4.269468281711683e-05, + "loss": 2.9833, + "step": 1410000 + }, + { + "epoch": 0.4384744632534774, + "grad_norm": 18.2299747467041, + "learning_rate": 4.2692092279108715e-05, + "loss": 3.0074, + "step": 1410500 + }, + { + "epoch": 0.4386298955339643, + "grad_norm": 7.354091644287109, + "learning_rate": 4.2689501741100596e-05, + "loss": 3.02, + "step": 1411000 + }, + { + "epoch": 0.4387853278144512, + "grad_norm": 21.20577621459961, + "learning_rate": 4.268691120309248e-05, + "loss": 2.9692, + "step": 1411500 + }, + { + "epoch": 0.43894076009493804, + "grad_norm": 30.489688873291016, + "learning_rate": 4.268432066508437e-05, + "loss": 2.9957, + "step": 1412000 + }, + { + "epoch": 0.4390961923754249, + "grad_norm": 8.517279624938965, + "learning_rate": 4.268173012707625e-05, + "loss": 2.9729, + "step": 1412500 + }, + { + "epoch": 0.4392516246559118, + "grad_norm": 8.421247482299805, + "learning_rate": 4.267913958906814e-05, + "loss": 2.9638, + "step": 1413000 + }, + { + "epoch": 0.43940705693639864, + "grad_norm": 9.642681121826172, + "learning_rate": 4.2676549051060025e-05, + "loss": 3.0173, + "step": 1413500 + }, + { + "epoch": 0.43956248921688557, + "grad_norm": 7.417707443237305, + "learning_rate": 4.267395851305191e-05, + "loss": 3.0017, + "step": 1414000 + }, + { + "epoch": 0.43971792149737243, + "grad_norm": 13.389802932739258, + "learning_rate": 4.26713679750438e-05, + "loss": 3.0332, + "step": 1414500 + }, + { + "epoch": 0.4398733537778593, + "grad_norm": 6.955569744110107, + "learning_rate": 4.266877743703568e-05, + "loss": 2.9794, + "step": 1415000 + }, + { + "epoch": 0.44002878605834617, + "grad_norm": 7.868232250213623, + "learning_rate": 4.2666186899027567e-05, + "loss": 3.0054, + "step": 1415500 + }, + { + "epoch": 0.44018421833883303, + "grad_norm": 8.779460906982422, + "learning_rate": 4.2663596361019454e-05, + "loss": 2.9648, + "step": 1416000 + }, + { + "epoch": 0.4403396506193199, + "grad_norm": 8.199538230895996, + "learning_rate": 4.2661005823011334e-05, + "loss": 2.957, + "step": 1416500 + }, + { + "epoch": 0.4404950828998068, + "grad_norm": 7.760096073150635, + "learning_rate": 4.265841528500322e-05, + "loss": 2.9616, + "step": 1417000 + }, + { + "epoch": 0.4406505151802937, + "grad_norm": 12.223222732543945, + "learning_rate": 4.265582474699511e-05, + "loss": 2.9965, + "step": 1417500 + }, + { + "epoch": 0.44080594746078056, + "grad_norm": 11.769410133361816, + "learning_rate": 4.265323420898699e-05, + "loss": 3.0413, + "step": 1418000 + }, + { + "epoch": 0.4409613797412674, + "grad_norm": 8.940576553344727, + "learning_rate": 4.2650643670978876e-05, + "loss": 3.0347, + "step": 1418500 + }, + { + "epoch": 0.4411168120217543, + "grad_norm": 9.177142143249512, + "learning_rate": 4.264805313297076e-05, + "loss": 3.0287, + "step": 1419000 + }, + { + "epoch": 0.44127224430224116, + "grad_norm": 7.187388896942139, + "learning_rate": 4.264546259496265e-05, + "loss": 3.0026, + "step": 1419500 + }, + { + "epoch": 0.4414276765827281, + "grad_norm": 6.980782985687256, + "learning_rate": 4.264287205695454e-05, + "loss": 3.0091, + "step": 1420000 + }, + { + "epoch": 0.44158310886321495, + "grad_norm": 9.781368255615234, + "learning_rate": 4.264028151894642e-05, + "loss": 3.0211, + "step": 1420500 + }, + { + "epoch": 0.4417385411437018, + "grad_norm": 8.474227905273438, + "learning_rate": 4.2637690980938305e-05, + "loss": 2.9951, + "step": 1421000 + }, + { + "epoch": 0.4418939734241887, + "grad_norm": 8.533623695373535, + "learning_rate": 4.263510044293019e-05, + "loss": 2.9775, + "step": 1421500 + }, + { + "epoch": 0.44204940570467555, + "grad_norm": 8.354305267333984, + "learning_rate": 4.263250990492207e-05, + "loss": 3.0116, + "step": 1422000 + }, + { + "epoch": 0.4422048379851624, + "grad_norm": 7.202996253967285, + "learning_rate": 4.262991936691396e-05, + "loss": 2.984, + "step": 1422500 + }, + { + "epoch": 0.44236027026564934, + "grad_norm": 18.72014808654785, + "learning_rate": 4.262732882890585e-05, + "loss": 3.0008, + "step": 1423000 + }, + { + "epoch": 0.4425157025461362, + "grad_norm": 8.901729583740234, + "learning_rate": 4.2624738290897734e-05, + "loss": 3.0012, + "step": 1423500 + }, + { + "epoch": 0.44267113482662307, + "grad_norm": 7.418874740600586, + "learning_rate": 4.262214775288962e-05, + "loss": 3.0352, + "step": 1424000 + }, + { + "epoch": 0.44282656710710994, + "grad_norm": 8.352479934692383, + "learning_rate": 4.26195572148815e-05, + "loss": 2.9882, + "step": 1424500 + }, + { + "epoch": 0.4429819993875968, + "grad_norm": 7.526435375213623, + "learning_rate": 4.261696667687339e-05, + "loss": 2.9742, + "step": 1425000 + }, + { + "epoch": 0.44313743166808367, + "grad_norm": 7.421649932861328, + "learning_rate": 4.2614376138865276e-05, + "loss": 3.0165, + "step": 1425500 + }, + { + "epoch": 0.4432928639485706, + "grad_norm": 18.567626953125, + "learning_rate": 4.2611785600857156e-05, + "loss": 3.0302, + "step": 1426000 + }, + { + "epoch": 0.44344829622905746, + "grad_norm": 9.243324279785156, + "learning_rate": 4.260919506284904e-05, + "loss": 2.9917, + "step": 1426500 + }, + { + "epoch": 0.4436037285095443, + "grad_norm": 9.93535041809082, + "learning_rate": 4.260660452484093e-05, + "loss": 2.9919, + "step": 1427000 + }, + { + "epoch": 0.4437591607900312, + "grad_norm": 8.226630210876465, + "learning_rate": 4.260401398683281e-05, + "loss": 3.0139, + "step": 1427500 + }, + { + "epoch": 0.44391459307051806, + "grad_norm": 9.631587982177734, + "learning_rate": 4.26014234488247e-05, + "loss": 3.0406, + "step": 1428000 + }, + { + "epoch": 0.4440700253510049, + "grad_norm": 18.008869171142578, + "learning_rate": 4.2598832910816585e-05, + "loss": 3.0393, + "step": 1428500 + }, + { + "epoch": 0.44422545763149185, + "grad_norm": 8.41627311706543, + "learning_rate": 4.259624237280847e-05, + "loss": 2.9945, + "step": 1429000 + }, + { + "epoch": 0.4443808899119787, + "grad_norm": 8.259761810302734, + "learning_rate": 4.259365183480036e-05, + "loss": 3.0316, + "step": 1429500 + }, + { + "epoch": 0.4445363221924656, + "grad_norm": 10.21623706817627, + "learning_rate": 4.2591061296792247e-05, + "loss": 3.0412, + "step": 1430000 + }, + { + "epoch": 0.44469175447295245, + "grad_norm": 10.41943359375, + "learning_rate": 4.258847075878413e-05, + "loss": 3.0102, + "step": 1430500 + }, + { + "epoch": 0.4448471867534393, + "grad_norm": 10.94701862335205, + "learning_rate": 4.2585880220776014e-05, + "loss": 2.9836, + "step": 1431000 + }, + { + "epoch": 0.4450026190339262, + "grad_norm": 7.594933986663818, + "learning_rate": 4.2583289682767894e-05, + "loss": 3.0018, + "step": 1431500 + }, + { + "epoch": 0.4451580513144131, + "grad_norm": 10.222610473632812, + "learning_rate": 4.258069914475978e-05, + "loss": 2.9742, + "step": 1432000 + }, + { + "epoch": 0.4453134835949, + "grad_norm": 6.597044944763184, + "learning_rate": 4.257810860675167e-05, + "loss": 3.053, + "step": 1432500 + }, + { + "epoch": 0.44546891587538684, + "grad_norm": 8.264097213745117, + "learning_rate": 4.2575518068743556e-05, + "loss": 2.9801, + "step": 1433000 + }, + { + "epoch": 0.4456243481558737, + "grad_norm": 17.9926700592041, + "learning_rate": 4.257292753073544e-05, + "loss": 3.0029, + "step": 1433500 + }, + { + "epoch": 0.4457797804363606, + "grad_norm": 7.994649410247803, + "learning_rate": 4.257033699272733e-05, + "loss": 2.9106, + "step": 1434000 + }, + { + "epoch": 0.44593521271684744, + "grad_norm": 7.981579780578613, + "learning_rate": 4.256774645471921e-05, + "loss": 2.9997, + "step": 1434500 + }, + { + "epoch": 0.44609064499733436, + "grad_norm": 6.69456148147583, + "learning_rate": 4.25651559167111e-05, + "loss": 3.0435, + "step": 1435000 + }, + { + "epoch": 0.44624607727782123, + "grad_norm": 6.715418338775635, + "learning_rate": 4.2562565378702985e-05, + "loss": 2.9949, + "step": 1435500 + }, + { + "epoch": 0.4464015095583081, + "grad_norm": 6.576136112213135, + "learning_rate": 4.2559974840694865e-05, + "loss": 3.0285, + "step": 1436000 + }, + { + "epoch": 0.44655694183879496, + "grad_norm": 27.421283721923828, + "learning_rate": 4.255738430268675e-05, + "loss": 2.9935, + "step": 1436500 + }, + { + "epoch": 0.44671237411928183, + "grad_norm": 8.821805953979492, + "learning_rate": 4.255479376467863e-05, + "loss": 3.0407, + "step": 1437000 + }, + { + "epoch": 0.4468678063997687, + "grad_norm": 7.051980495452881, + "learning_rate": 4.255220322667052e-05, + "loss": 3.0186, + "step": 1437500 + }, + { + "epoch": 0.4470232386802556, + "grad_norm": 10.933259963989258, + "learning_rate": 4.254961268866241e-05, + "loss": 3.0262, + "step": 1438000 + }, + { + "epoch": 0.4471786709607425, + "grad_norm": 9.587018966674805, + "learning_rate": 4.2547022150654294e-05, + "loss": 3.0172, + "step": 1438500 + }, + { + "epoch": 0.44733410324122935, + "grad_norm": 12.38866901397705, + "learning_rate": 4.254443161264618e-05, + "loss": 2.9972, + "step": 1439000 + }, + { + "epoch": 0.4474895355217162, + "grad_norm": 6.506922721862793, + "learning_rate": 4.254184107463807e-05, + "loss": 2.9787, + "step": 1439500 + }, + { + "epoch": 0.4476449678022031, + "grad_norm": 11.52219295501709, + "learning_rate": 4.253925053662995e-05, + "loss": 2.976, + "step": 1440000 + }, + { + "epoch": 0.44780040008268995, + "grad_norm": 17.15482521057129, + "learning_rate": 4.2536659998621836e-05, + "loss": 2.9935, + "step": 1440500 + }, + { + "epoch": 0.4479558323631769, + "grad_norm": 8.258350372314453, + "learning_rate": 4.253406946061372e-05, + "loss": 3.0391, + "step": 1441000 + }, + { + "epoch": 0.44811126464366374, + "grad_norm": 13.912190437316895, + "learning_rate": 4.2531478922605604e-05, + "loss": 3.0186, + "step": 1441500 + }, + { + "epoch": 0.4482666969241506, + "grad_norm": 8.346692085266113, + "learning_rate": 4.252888838459749e-05, + "loss": 2.9956, + "step": 1442000 + }, + { + "epoch": 0.4484221292046375, + "grad_norm": 8.994848251342773, + "learning_rate": 4.252629784658938e-05, + "loss": 3.0192, + "step": 1442500 + }, + { + "epoch": 0.44857756148512434, + "grad_norm": 7.42503023147583, + "learning_rate": 4.2523707308581265e-05, + "loss": 3.0043, + "step": 1443000 + }, + { + "epoch": 0.4487329937656112, + "grad_norm": 9.137506484985352, + "learning_rate": 4.252111677057315e-05, + "loss": 2.9802, + "step": 1443500 + }, + { + "epoch": 0.44888842604609813, + "grad_norm": 9.542128562927246, + "learning_rate": 4.251852623256503e-05, + "loss": 2.9988, + "step": 1444000 + }, + { + "epoch": 0.449043858326585, + "grad_norm": 60.539581298828125, + "learning_rate": 4.251593569455692e-05, + "loss": 3.0148, + "step": 1444500 + }, + { + "epoch": 0.44919929060707187, + "grad_norm": 8.762730598449707, + "learning_rate": 4.251334515654881e-05, + "loss": 2.9718, + "step": 1445000 + }, + { + "epoch": 0.44935472288755873, + "grad_norm": 9.078280448913574, + "learning_rate": 4.251075461854069e-05, + "loss": 3.0431, + "step": 1445500 + }, + { + "epoch": 0.4495101551680456, + "grad_norm": 6.732937812805176, + "learning_rate": 4.2508164080532575e-05, + "loss": 2.979, + "step": 1446000 + }, + { + "epoch": 0.44966558744853247, + "grad_norm": 9.385017395019531, + "learning_rate": 4.250557354252446e-05, + "loss": 2.952, + "step": 1446500 + }, + { + "epoch": 0.4498210197290194, + "grad_norm": 7.160591125488281, + "learning_rate": 4.250298300451634e-05, + "loss": 2.9788, + "step": 1447000 + }, + { + "epoch": 0.44997645200950626, + "grad_norm": 6.983069896697998, + "learning_rate": 4.250039246650823e-05, + "loss": 3.0281, + "step": 1447500 + }, + { + "epoch": 0.4501318842899931, + "grad_norm": 7.314948558807373, + "learning_rate": 4.2497801928500116e-05, + "loss": 2.9971, + "step": 1448000 + }, + { + "epoch": 0.45028731657048, + "grad_norm": 7.656804084777832, + "learning_rate": 4.2495211390492003e-05, + "loss": 3.0073, + "step": 1448500 + }, + { + "epoch": 0.45044274885096686, + "grad_norm": 5.40187931060791, + "learning_rate": 4.249262085248389e-05, + "loss": 3.0286, + "step": 1449000 + }, + { + "epoch": 0.4505981811314537, + "grad_norm": 24.832868576049805, + "learning_rate": 4.249003031447577e-05, + "loss": 2.9919, + "step": 1449500 + }, + { + "epoch": 0.45075361341194065, + "grad_norm": 22.462081909179688, + "learning_rate": 4.248743977646766e-05, + "loss": 3.0519, + "step": 1450000 + }, + { + "epoch": 0.4509090456924275, + "grad_norm": 8.079568862915039, + "learning_rate": 4.2484849238459545e-05, + "loss": 3.0095, + "step": 1450500 + }, + { + "epoch": 0.4510644779729144, + "grad_norm": 6.018609523773193, + "learning_rate": 4.2482258700451426e-05, + "loss": 2.9821, + "step": 1451000 + }, + { + "epoch": 0.45121991025340125, + "grad_norm": 7.2614850997924805, + "learning_rate": 4.247966816244331e-05, + "loss": 3.0174, + "step": 1451500 + }, + { + "epoch": 0.4513753425338881, + "grad_norm": 9.855159759521484, + "learning_rate": 4.24770776244352e-05, + "loss": 3.0051, + "step": 1452000 + }, + { + "epoch": 0.451530774814375, + "grad_norm": 4.962505340576172, + "learning_rate": 4.247448708642709e-05, + "loss": 2.9889, + "step": 1452500 + }, + { + "epoch": 0.4516862070948619, + "grad_norm": 10.051335334777832, + "learning_rate": 4.2471896548418974e-05, + "loss": 3.0197, + "step": 1453000 + }, + { + "epoch": 0.45184163937534877, + "grad_norm": 9.899216651916504, + "learning_rate": 4.246930601041086e-05, + "loss": 3.0457, + "step": 1453500 + }, + { + "epoch": 0.45199707165583564, + "grad_norm": 9.416171073913574, + "learning_rate": 4.246671547240274e-05, + "loss": 3.0075, + "step": 1454000 + }, + { + "epoch": 0.4521525039363225, + "grad_norm": 11.051389694213867, + "learning_rate": 4.246412493439463e-05, + "loss": 2.9937, + "step": 1454500 + }, + { + "epoch": 0.45230793621680937, + "grad_norm": 10.562308311462402, + "learning_rate": 4.246153439638651e-05, + "loss": 3.0213, + "step": 1455000 + }, + { + "epoch": 0.45246336849729624, + "grad_norm": 15.106437683105469, + "learning_rate": 4.2458943858378397e-05, + "loss": 2.9779, + "step": 1455500 + }, + { + "epoch": 0.45261880077778316, + "grad_norm": 8.569409370422363, + "learning_rate": 4.2456353320370284e-05, + "loss": 3.0096, + "step": 1456000 + }, + { + "epoch": 0.45277423305827, + "grad_norm": 8.530065536499023, + "learning_rate": 4.2453762782362164e-05, + "loss": 3.0351, + "step": 1456500 + }, + { + "epoch": 0.4529296653387569, + "grad_norm": 8.65596866607666, + "learning_rate": 4.245117224435405e-05, + "loss": 3.0048, + "step": 1457000 + }, + { + "epoch": 0.45308509761924376, + "grad_norm": 6.975749492645264, + "learning_rate": 4.244858170634594e-05, + "loss": 3.0347, + "step": 1457500 + }, + { + "epoch": 0.4532405298997306, + "grad_norm": 9.31984806060791, + "learning_rate": 4.2445991168337826e-05, + "loss": 2.9649, + "step": 1458000 + }, + { + "epoch": 0.4533959621802175, + "grad_norm": 9.84131908416748, + "learning_rate": 4.244340063032971e-05, + "loss": 3.0394, + "step": 1458500 + }, + { + "epoch": 0.4535513944607044, + "grad_norm": 8.005767822265625, + "learning_rate": 4.24408100923216e-05, + "loss": 2.9878, + "step": 1459000 + }, + { + "epoch": 0.4537068267411913, + "grad_norm": 7.764986515045166, + "learning_rate": 4.243821955431348e-05, + "loss": 3.0193, + "step": 1459500 + }, + { + "epoch": 0.45386225902167815, + "grad_norm": 6.134619235992432, + "learning_rate": 4.243562901630537e-05, + "loss": 3.0117, + "step": 1460000 + }, + { + "epoch": 0.454017691302165, + "grad_norm": 9.1181640625, + "learning_rate": 4.2433038478297255e-05, + "loss": 2.9812, + "step": 1460500 + }, + { + "epoch": 0.4541731235826519, + "grad_norm": 9.423063278198242, + "learning_rate": 4.2430447940289135e-05, + "loss": 2.9893, + "step": 1461000 + }, + { + "epoch": 0.45432855586313875, + "grad_norm": 8.423585891723633, + "learning_rate": 4.242785740228102e-05, + "loss": 3.0011, + "step": 1461500 + }, + { + "epoch": 0.4544839881436256, + "grad_norm": 7.900973796844482, + "learning_rate": 4.24252668642729e-05, + "loss": 2.9529, + "step": 1462000 + }, + { + "epoch": 0.45463942042411254, + "grad_norm": 9.245067596435547, + "learning_rate": 4.2422676326264796e-05, + "loss": 3.0051, + "step": 1462500 + }, + { + "epoch": 0.4547948527045994, + "grad_norm": 7.960526466369629, + "learning_rate": 4.2420085788256684e-05, + "loss": 3.0204, + "step": 1463000 + }, + { + "epoch": 0.4549502849850863, + "grad_norm": 8.956145286560059, + "learning_rate": 4.2417495250248564e-05, + "loss": 3.0032, + "step": 1463500 + }, + { + "epoch": 0.45510571726557314, + "grad_norm": 7.965908527374268, + "learning_rate": 4.241490471224045e-05, + "loss": 3.001, + "step": 1464000 + }, + { + "epoch": 0.45526114954606, + "grad_norm": 10.481865882873535, + "learning_rate": 4.241231417423234e-05, + "loss": 2.9777, + "step": 1464500 + }, + { + "epoch": 0.4554165818265469, + "grad_norm": 7.602447986602783, + "learning_rate": 4.240972363622422e-05, + "loss": 2.9409, + "step": 1465000 + }, + { + "epoch": 0.4555720141070338, + "grad_norm": 9.708548545837402, + "learning_rate": 4.2407133098216106e-05, + "loss": 3.0113, + "step": 1465500 + }, + { + "epoch": 0.45572744638752066, + "grad_norm": 9.152201652526855, + "learning_rate": 4.240454256020799e-05, + "loss": 3.007, + "step": 1466000 + }, + { + "epoch": 0.45588287866800753, + "grad_norm": 9.478516578674316, + "learning_rate": 4.240195202219987e-05, + "loss": 3.0226, + "step": 1466500 + }, + { + "epoch": 0.4560383109484944, + "grad_norm": 8.128018379211426, + "learning_rate": 4.239936148419176e-05, + "loss": 3.0162, + "step": 1467000 + }, + { + "epoch": 0.45619374322898126, + "grad_norm": 7.717082977294922, + "learning_rate": 4.239677094618365e-05, + "loss": 2.9837, + "step": 1467500 + }, + { + "epoch": 0.45634917550946813, + "grad_norm": 7.780921936035156, + "learning_rate": 4.2394180408175535e-05, + "loss": 2.9878, + "step": 1468000 + }, + { + "epoch": 0.45650460778995505, + "grad_norm": 56.645599365234375, + "learning_rate": 4.239158987016742e-05, + "loss": 3.0349, + "step": 1468500 + }, + { + "epoch": 0.4566600400704419, + "grad_norm": 5.329046726226807, + "learning_rate": 4.23889993321593e-05, + "loss": 2.9285, + "step": 1469000 + }, + { + "epoch": 0.4568154723509288, + "grad_norm": 7.378773212432861, + "learning_rate": 4.238640879415119e-05, + "loss": 2.9923, + "step": 1469500 + }, + { + "epoch": 0.45697090463141565, + "grad_norm": 6.71425199508667, + "learning_rate": 4.2383818256143077e-05, + "loss": 3.0341, + "step": 1470000 + }, + { + "epoch": 0.4571263369119025, + "grad_norm": 9.840527534484863, + "learning_rate": 4.238122771813496e-05, + "loss": 2.9777, + "step": 1470500 + }, + { + "epoch": 0.4572817691923894, + "grad_norm": 10.87326717376709, + "learning_rate": 4.2378637180126844e-05, + "loss": 3.0114, + "step": 1471000 + }, + { + "epoch": 0.4574372014728763, + "grad_norm": 13.323182106018066, + "learning_rate": 4.237604664211873e-05, + "loss": 3.0004, + "step": 1471500 + }, + { + "epoch": 0.4575926337533632, + "grad_norm": 7.680633544921875, + "learning_rate": 4.237345610411061e-05, + "loss": 2.9887, + "step": 1472000 + }, + { + "epoch": 0.45774806603385004, + "grad_norm": 7.863221168518066, + "learning_rate": 4.2370865566102506e-05, + "loss": 3.0353, + "step": 1472500 + }, + { + "epoch": 0.4579034983143369, + "grad_norm": 7.460826873779297, + "learning_rate": 4.2368275028094386e-05, + "loss": 3.0228, + "step": 1473000 + }, + { + "epoch": 0.4580589305948238, + "grad_norm": 15.249360084533691, + "learning_rate": 4.236568449008627e-05, + "loss": 2.9963, + "step": 1473500 + }, + { + "epoch": 0.45821436287531064, + "grad_norm": 8.275860786437988, + "learning_rate": 4.236309395207816e-05, + "loss": 3.0507, + "step": 1474000 + }, + { + "epoch": 0.45836979515579757, + "grad_norm": 9.312052726745605, + "learning_rate": 4.236050341407004e-05, + "loss": 2.9883, + "step": 1474500 + }, + { + "epoch": 0.45852522743628443, + "grad_norm": 10.195779800415039, + "learning_rate": 4.235791287606193e-05, + "loss": 3.0251, + "step": 1475000 + }, + { + "epoch": 0.4586806597167713, + "grad_norm": 21.333539962768555, + "learning_rate": 4.2355322338053815e-05, + "loss": 2.9875, + "step": 1475500 + }, + { + "epoch": 0.45883609199725817, + "grad_norm": 6.168112754821777, + "learning_rate": 4.2352731800045695e-05, + "loss": 2.9807, + "step": 1476000 + }, + { + "epoch": 0.45899152427774503, + "grad_norm": 22.11046600341797, + "learning_rate": 4.235014126203758e-05, + "loss": 2.9873, + "step": 1476500 + }, + { + "epoch": 0.4591469565582319, + "grad_norm": 8.431046485900879, + "learning_rate": 4.234755072402947e-05, + "loss": 2.9977, + "step": 1477000 + }, + { + "epoch": 0.4593023888387188, + "grad_norm": 7.752357006072998, + "learning_rate": 4.234496018602136e-05, + "loss": 3.0024, + "step": 1477500 + }, + { + "epoch": 0.4594578211192057, + "grad_norm": 6.198019981384277, + "learning_rate": 4.2342369648013244e-05, + "loss": 2.9812, + "step": 1478000 + }, + { + "epoch": 0.45961325339969256, + "grad_norm": 10.855496406555176, + "learning_rate": 4.233977911000513e-05, + "loss": 2.9838, + "step": 1478500 + }, + { + "epoch": 0.4597686856801794, + "grad_norm": 9.94638442993164, + "learning_rate": 4.233718857199701e-05, + "loss": 3.0418, + "step": 1479000 + }, + { + "epoch": 0.4599241179606663, + "grad_norm": 25.15116310119629, + "learning_rate": 4.23345980339889e-05, + "loss": 3.0047, + "step": 1479500 + }, + { + "epoch": 0.46007955024115316, + "grad_norm": 8.264490127563477, + "learning_rate": 4.233200749598078e-05, + "loss": 3.0015, + "step": 1480000 + }, + { + "epoch": 0.4602349825216401, + "grad_norm": 8.947308540344238, + "learning_rate": 4.2329416957972666e-05, + "loss": 2.9926, + "step": 1480500 + }, + { + "epoch": 0.46039041480212695, + "grad_norm": 8.052371978759766, + "learning_rate": 4.232682641996455e-05, + "loss": 2.9754, + "step": 1481000 + }, + { + "epoch": 0.4605458470826138, + "grad_norm": 9.157440185546875, + "learning_rate": 4.2324235881956434e-05, + "loss": 3.0032, + "step": 1481500 + }, + { + "epoch": 0.4607012793631007, + "grad_norm": 9.025094032287598, + "learning_rate": 4.232164534394832e-05, + "loss": 2.9863, + "step": 1482000 + }, + { + "epoch": 0.46085671164358755, + "grad_norm": 9.184708595275879, + "learning_rate": 4.2319054805940215e-05, + "loss": 2.9739, + "step": 1482500 + }, + { + "epoch": 0.4610121439240744, + "grad_norm": 9.460906028747559, + "learning_rate": 4.2316464267932095e-05, + "loss": 3.0825, + "step": 1483000 + }, + { + "epoch": 0.46116757620456134, + "grad_norm": 8.015843391418457, + "learning_rate": 4.231387372992398e-05, + "loss": 2.9833, + "step": 1483500 + }, + { + "epoch": 0.4613230084850482, + "grad_norm": 7.609298229217529, + "learning_rate": 4.231128319191587e-05, + "loss": 3.0582, + "step": 1484000 + }, + { + "epoch": 0.46147844076553507, + "grad_norm": 7.041804790496826, + "learning_rate": 4.230869265390775e-05, + "loss": 2.998, + "step": 1484500 + }, + { + "epoch": 0.46163387304602194, + "grad_norm": 17.518718719482422, + "learning_rate": 4.230610211589964e-05, + "loss": 3.0101, + "step": 1485000 + }, + { + "epoch": 0.4617893053265088, + "grad_norm": 12.145074844360352, + "learning_rate": 4.230351157789152e-05, + "loss": 3.0204, + "step": 1485500 + }, + { + "epoch": 0.46194473760699567, + "grad_norm": 6.353680610656738, + "learning_rate": 4.2300921039883404e-05, + "loss": 2.9784, + "step": 1486000 + }, + { + "epoch": 0.4621001698874826, + "grad_norm": 6.913758277893066, + "learning_rate": 4.229833050187529e-05, + "loss": 3.0071, + "step": 1486500 + }, + { + "epoch": 0.46225560216796946, + "grad_norm": 7.297600746154785, + "learning_rate": 4.229573996386718e-05, + "loss": 3.0651, + "step": 1487000 + }, + { + "epoch": 0.4624110344484563, + "grad_norm": 11.413549423217773, + "learning_rate": 4.2293149425859066e-05, + "loss": 3.0273, + "step": 1487500 + }, + { + "epoch": 0.4625664667289432, + "grad_norm": 13.494542121887207, + "learning_rate": 4.229055888785095e-05, + "loss": 2.9513, + "step": 1488000 + }, + { + "epoch": 0.46272189900943006, + "grad_norm": 11.361411094665527, + "learning_rate": 4.2287968349842833e-05, + "loss": 2.9684, + "step": 1488500 + }, + { + "epoch": 0.4628773312899169, + "grad_norm": 8.555718421936035, + "learning_rate": 4.228537781183472e-05, + "loss": 3.0255, + "step": 1489000 + }, + { + "epoch": 0.46303276357040385, + "grad_norm": 7.069640636444092, + "learning_rate": 4.228278727382661e-05, + "loss": 2.995, + "step": 1489500 + }, + { + "epoch": 0.4631881958508907, + "grad_norm": 8.27678394317627, + "learning_rate": 4.228019673581849e-05, + "loss": 3.0136, + "step": 1490000 + }, + { + "epoch": 0.4633436281313776, + "grad_norm": 9.418810844421387, + "learning_rate": 4.2277606197810375e-05, + "loss": 3.0438, + "step": 1490500 + }, + { + "epoch": 0.46349906041186445, + "grad_norm": 7.659292221069336, + "learning_rate": 4.2275015659802256e-05, + "loss": 2.9865, + "step": 1491000 + }, + { + "epoch": 0.4636544926923513, + "grad_norm": 8.854900360107422, + "learning_rate": 4.227242512179414e-05, + "loss": 3.0379, + "step": 1491500 + }, + { + "epoch": 0.4638099249728382, + "grad_norm": 10.208396911621094, + "learning_rate": 4.226983458378603e-05, + "loss": 3.0191, + "step": 1492000 + }, + { + "epoch": 0.4639653572533251, + "grad_norm": 7.2024126052856445, + "learning_rate": 4.226724404577792e-05, + "loss": 2.9904, + "step": 1492500 + }, + { + "epoch": 0.464120789533812, + "grad_norm": 4.777499675750732, + "learning_rate": 4.2264653507769804e-05, + "loss": 2.9902, + "step": 1493000 + }, + { + "epoch": 0.46427622181429884, + "grad_norm": 9.344300270080566, + "learning_rate": 4.226206296976169e-05, + "loss": 2.9864, + "step": 1493500 + }, + { + "epoch": 0.4644316540947857, + "grad_norm": 8.164283752441406, + "learning_rate": 4.225947243175357e-05, + "loss": 2.9879, + "step": 1494000 + }, + { + "epoch": 0.4645870863752726, + "grad_norm": 8.797891616821289, + "learning_rate": 4.225688189374546e-05, + "loss": 3.0235, + "step": 1494500 + }, + { + "epoch": 0.46474251865575944, + "grad_norm": 6.737269401550293, + "learning_rate": 4.2254291355737346e-05, + "loss": 3.0091, + "step": 1495000 + }, + { + "epoch": 0.46489795093624636, + "grad_norm": 8.673149108886719, + "learning_rate": 4.2251700817729227e-05, + "loss": 2.9683, + "step": 1495500 + }, + { + "epoch": 0.46505338321673323, + "grad_norm": 8.45002269744873, + "learning_rate": 4.2249110279721114e-05, + "loss": 3.0284, + "step": 1496000 + }, + { + "epoch": 0.4652088154972201, + "grad_norm": 6.983867645263672, + "learning_rate": 4.2246519741713e-05, + "loss": 3.0098, + "step": 1496500 + }, + { + "epoch": 0.46536424777770696, + "grad_norm": 8.020002365112305, + "learning_rate": 4.224392920370489e-05, + "loss": 3.0177, + "step": 1497000 + }, + { + "epoch": 0.46551968005819383, + "grad_norm": 7.587404727935791, + "learning_rate": 4.2241338665696775e-05, + "loss": 3.0232, + "step": 1497500 + }, + { + "epoch": 0.4656751123386807, + "grad_norm": 7.880170822143555, + "learning_rate": 4.2238748127688655e-05, + "loss": 2.9931, + "step": 1498000 + }, + { + "epoch": 0.4658305446191676, + "grad_norm": 15.621661186218262, + "learning_rate": 4.223615758968054e-05, + "loss": 2.959, + "step": 1498500 + }, + { + "epoch": 0.4659859768996545, + "grad_norm": 9.32766342163086, + "learning_rate": 4.223356705167243e-05, + "loss": 2.9852, + "step": 1499000 + }, + { + "epoch": 0.46614140918014135, + "grad_norm": 18.60662269592285, + "learning_rate": 4.223097651366431e-05, + "loss": 3.0102, + "step": 1499500 + }, + { + "epoch": 0.4662968414606282, + "grad_norm": 8.809861183166504, + "learning_rate": 4.22283859756562e-05, + "loss": 3.0196, + "step": 1500000 + }, + { + "epoch": 0.4664522737411151, + "grad_norm": 8.342860221862793, + "learning_rate": 4.2225795437648084e-05, + "loss": 3.0392, + "step": 1500500 + }, + { + "epoch": 0.46660770602160195, + "grad_norm": 9.577779769897461, + "learning_rate": 4.2223204899639965e-05, + "loss": 3.0059, + "step": 1501000 + }, + { + "epoch": 0.4667631383020889, + "grad_norm": 7.596067905426025, + "learning_rate": 4.222061436163185e-05, + "loss": 3.0211, + "step": 1501500 + }, + { + "epoch": 0.46691857058257574, + "grad_norm": 9.003830909729004, + "learning_rate": 4.221802382362374e-05, + "loss": 3.0792, + "step": 1502000 + }, + { + "epoch": 0.4670740028630626, + "grad_norm": 10.220355033874512, + "learning_rate": 4.2215433285615626e-05, + "loss": 2.9688, + "step": 1502500 + }, + { + "epoch": 0.4672294351435495, + "grad_norm": 18.652681350708008, + "learning_rate": 4.2212842747607513e-05, + "loss": 2.9788, + "step": 1503000 + }, + { + "epoch": 0.46738486742403634, + "grad_norm": 7.964770793914795, + "learning_rate": 4.2210252209599394e-05, + "loss": 3.0195, + "step": 1503500 + }, + { + "epoch": 0.4675402997045232, + "grad_norm": 7.544254779815674, + "learning_rate": 4.220766167159128e-05, + "loss": 3.0591, + "step": 1504000 + }, + { + "epoch": 0.46769573198501013, + "grad_norm": 7.850869178771973, + "learning_rate": 4.220507113358317e-05, + "loss": 2.975, + "step": 1504500 + }, + { + "epoch": 0.467851164265497, + "grad_norm": 8.948171615600586, + "learning_rate": 4.220248059557505e-05, + "loss": 3.0124, + "step": 1505000 + }, + { + "epoch": 0.46800659654598387, + "grad_norm": 6.246691703796387, + "learning_rate": 4.2199890057566936e-05, + "loss": 3.0127, + "step": 1505500 + }, + { + "epoch": 0.46816202882647073, + "grad_norm": 7.6469855308532715, + "learning_rate": 4.219729951955882e-05, + "loss": 3.0164, + "step": 1506000 + }, + { + "epoch": 0.4683174611069576, + "grad_norm": 8.55682373046875, + "learning_rate": 4.219470898155071e-05, + "loss": 3.0012, + "step": 1506500 + }, + { + "epoch": 0.46847289338744447, + "grad_norm": 14.423203468322754, + "learning_rate": 4.21921184435426e-05, + "loss": 2.9423, + "step": 1507000 + }, + { + "epoch": 0.4686283256679314, + "grad_norm": 10.178650856018066, + "learning_rate": 4.2189527905534484e-05, + "loss": 2.9989, + "step": 1507500 + }, + { + "epoch": 0.46878375794841826, + "grad_norm": 8.897357940673828, + "learning_rate": 4.2186937367526365e-05, + "loss": 2.974, + "step": 1508000 + }, + { + "epoch": 0.4689391902289051, + "grad_norm": 12.54212474822998, + "learning_rate": 4.218434682951825e-05, + "loss": 2.9973, + "step": 1508500 + }, + { + "epoch": 0.469094622509392, + "grad_norm": 10.598551750183105, + "learning_rate": 4.218175629151013e-05, + "loss": 3.0126, + "step": 1509000 + }, + { + "epoch": 0.46925005478987886, + "grad_norm": 6.777747631072998, + "learning_rate": 4.217916575350202e-05, + "loss": 2.9657, + "step": 1509500 + }, + { + "epoch": 0.4694054870703657, + "grad_norm": 6.931455612182617, + "learning_rate": 4.2176575215493907e-05, + "loss": 2.9727, + "step": 1510000 + }, + { + "epoch": 0.46956091935085265, + "grad_norm": 12.807791709899902, + "learning_rate": 4.217398467748579e-05, + "loss": 2.9765, + "step": 1510500 + }, + { + "epoch": 0.4697163516313395, + "grad_norm": 10.243975639343262, + "learning_rate": 4.2171394139477674e-05, + "loss": 2.9958, + "step": 1511000 + }, + { + "epoch": 0.4698717839118264, + "grad_norm": 8.814759254455566, + "learning_rate": 4.216880360146956e-05, + "loss": 2.9517, + "step": 1511500 + }, + { + "epoch": 0.47002721619231325, + "grad_norm": 9.653120994567871, + "learning_rate": 4.216621306346145e-05, + "loss": 3.0157, + "step": 1512000 + }, + { + "epoch": 0.4701826484728001, + "grad_norm": 8.883756637573242, + "learning_rate": 4.2163622525453336e-05, + "loss": 2.9499, + "step": 1512500 + }, + { + "epoch": 0.470338080753287, + "grad_norm": 7.332579135894775, + "learning_rate": 4.216103198744522e-05, + "loss": 2.9662, + "step": 1513000 + }, + { + "epoch": 0.4704935130337739, + "grad_norm": 10.117069244384766, + "learning_rate": 4.21584414494371e-05, + "loss": 2.982, + "step": 1513500 + }, + { + "epoch": 0.47064894531426077, + "grad_norm": 7.801002502441406, + "learning_rate": 4.215585091142899e-05, + "loss": 2.9883, + "step": 1514000 + }, + { + "epoch": 0.47080437759474764, + "grad_norm": 20.048770904541016, + "learning_rate": 4.215326037342088e-05, + "loss": 3.0212, + "step": 1514500 + }, + { + "epoch": 0.4709598098752345, + "grad_norm": 11.064713478088379, + "learning_rate": 4.215066983541276e-05, + "loss": 3.0066, + "step": 1515000 + }, + { + "epoch": 0.47111524215572137, + "grad_norm": 10.637811660766602, + "learning_rate": 4.2148079297404645e-05, + "loss": 2.9983, + "step": 1515500 + }, + { + "epoch": 0.47127067443620824, + "grad_norm": 13.067631721496582, + "learning_rate": 4.2145488759396525e-05, + "loss": 2.9886, + "step": 1516000 + }, + { + "epoch": 0.47142610671669516, + "grad_norm": 7.242276191711426, + "learning_rate": 4.214289822138842e-05, + "loss": 2.9698, + "step": 1516500 + }, + { + "epoch": 0.471581538997182, + "grad_norm": 8.174717903137207, + "learning_rate": 4.2140307683380306e-05, + "loss": 3.0058, + "step": 1517000 + }, + { + "epoch": 0.4717369712776689, + "grad_norm": 7.655884742736816, + "learning_rate": 4.213771714537219e-05, + "loss": 3.0133, + "step": 1517500 + }, + { + "epoch": 0.47189240355815576, + "grad_norm": 10.350359916687012, + "learning_rate": 4.2135126607364074e-05, + "loss": 2.9442, + "step": 1518000 + }, + { + "epoch": 0.4720478358386426, + "grad_norm": 15.614184379577637, + "learning_rate": 4.213253606935596e-05, + "loss": 2.9785, + "step": 1518500 + }, + { + "epoch": 0.4722032681191295, + "grad_norm": 7.625138759613037, + "learning_rate": 4.212994553134784e-05, + "loss": 3.0204, + "step": 1519000 + }, + { + "epoch": 0.4723587003996164, + "grad_norm": 8.434349060058594, + "learning_rate": 4.212735499333973e-05, + "loss": 2.9847, + "step": 1519500 + }, + { + "epoch": 0.4725141326801033, + "grad_norm": 13.638707160949707, + "learning_rate": 4.2124764455331616e-05, + "loss": 3.0409, + "step": 1520000 + }, + { + "epoch": 0.47266956496059015, + "grad_norm": 6.524325847625732, + "learning_rate": 4.2122173917323496e-05, + "loss": 3.0296, + "step": 1520500 + }, + { + "epoch": 0.472824997241077, + "grad_norm": 9.595826148986816, + "learning_rate": 4.211958337931538e-05, + "loss": 2.9463, + "step": 1521000 + }, + { + "epoch": 0.4729804295215639, + "grad_norm": 6.561254501342773, + "learning_rate": 4.211699284130727e-05, + "loss": 2.9784, + "step": 1521500 + }, + { + "epoch": 0.47313586180205075, + "grad_norm": 8.838866233825684, + "learning_rate": 4.211440230329916e-05, + "loss": 3.0028, + "step": 1522000 + }, + { + "epoch": 0.4732912940825377, + "grad_norm": 7.782862186431885, + "learning_rate": 4.2111811765291045e-05, + "loss": 3.0082, + "step": 1522500 + }, + { + "epoch": 0.47344672636302454, + "grad_norm": 14.910391807556152, + "learning_rate": 4.2109221227282925e-05, + "loss": 2.9625, + "step": 1523000 + }, + { + "epoch": 0.4736021586435114, + "grad_norm": 11.178431510925293, + "learning_rate": 4.210663068927481e-05, + "loss": 2.9789, + "step": 1523500 + }, + { + "epoch": 0.4737575909239983, + "grad_norm": 7.004639625549316, + "learning_rate": 4.21040401512667e-05, + "loss": 2.9867, + "step": 1524000 + }, + { + "epoch": 0.47391302320448514, + "grad_norm": 7.687747478485107, + "learning_rate": 4.210144961325858e-05, + "loss": 3.0157, + "step": 1524500 + }, + { + "epoch": 0.474068455484972, + "grad_norm": 20.457561492919922, + "learning_rate": 4.209885907525047e-05, + "loss": 2.9884, + "step": 1525000 + }, + { + "epoch": 0.47422388776545893, + "grad_norm": 7.42682409286499, + "learning_rate": 4.2096268537242354e-05, + "loss": 3.0292, + "step": 1525500 + }, + { + "epoch": 0.4743793200459458, + "grad_norm": 11.975543022155762, + "learning_rate": 4.2093677999234234e-05, + "loss": 2.9472, + "step": 1526000 + }, + { + "epoch": 0.47453475232643266, + "grad_norm": 8.103754997253418, + "learning_rate": 4.209108746122613e-05, + "loss": 2.9579, + "step": 1526500 + }, + { + "epoch": 0.47469018460691953, + "grad_norm": 8.857590675354004, + "learning_rate": 4.208849692321801e-05, + "loss": 3.0593, + "step": 1527000 + }, + { + "epoch": 0.4748456168874064, + "grad_norm": 10.141080856323242, + "learning_rate": 4.2085906385209896e-05, + "loss": 2.9956, + "step": 1527500 + }, + { + "epoch": 0.47500104916789326, + "grad_norm": 12.85505199432373, + "learning_rate": 4.208331584720178e-05, + "loss": 2.9932, + "step": 1528000 + }, + { + "epoch": 0.4751564814483802, + "grad_norm": 7.213264465332031, + "learning_rate": 4.2080725309193663e-05, + "loss": 2.982, + "step": 1528500 + }, + { + "epoch": 0.47531191372886705, + "grad_norm": 8.26455307006836, + "learning_rate": 4.207813477118555e-05, + "loss": 3.0246, + "step": 1529000 + }, + { + "epoch": 0.4754673460093539, + "grad_norm": 7.903802394866943, + "learning_rate": 4.207554423317744e-05, + "loss": 3.0068, + "step": 1529500 + }, + { + "epoch": 0.4756227782898408, + "grad_norm": 9.710756301879883, + "learning_rate": 4.207295369516932e-05, + "loss": 2.9736, + "step": 1530000 + }, + { + "epoch": 0.47577821057032765, + "grad_norm": 11.041851997375488, + "learning_rate": 4.2070363157161205e-05, + "loss": 3.0219, + "step": 1530500 + }, + { + "epoch": 0.4759336428508145, + "grad_norm": 9.746299743652344, + "learning_rate": 4.206777261915309e-05, + "loss": 3.0024, + "step": 1531000 + }, + { + "epoch": 0.47608907513130144, + "grad_norm": 9.580860137939453, + "learning_rate": 4.206518208114498e-05, + "loss": 3.0336, + "step": 1531500 + }, + { + "epoch": 0.4762445074117883, + "grad_norm": 12.486735343933105, + "learning_rate": 4.206259154313687e-05, + "loss": 3.0086, + "step": 1532000 + }, + { + "epoch": 0.4763999396922752, + "grad_norm": 9.212549209594727, + "learning_rate": 4.2060001005128754e-05, + "loss": 2.9761, + "step": 1532500 + }, + { + "epoch": 0.47655537197276204, + "grad_norm": 6.243236541748047, + "learning_rate": 4.2057410467120634e-05, + "loss": 2.9852, + "step": 1533000 + }, + { + "epoch": 0.4767108042532489, + "grad_norm": 9.140616416931152, + "learning_rate": 4.205481992911252e-05, + "loss": 2.9938, + "step": 1533500 + }, + { + "epoch": 0.4768662365337358, + "grad_norm": 9.160075187683105, + "learning_rate": 4.20522293911044e-05, + "loss": 3.0303, + "step": 1534000 + }, + { + "epoch": 0.4770216688142227, + "grad_norm": 8.060967445373535, + "learning_rate": 4.204963885309629e-05, + "loss": 2.9886, + "step": 1534500 + }, + { + "epoch": 0.47717710109470957, + "grad_norm": 15.80804443359375, + "learning_rate": 4.2047048315088176e-05, + "loss": 3.0287, + "step": 1535000 + }, + { + "epoch": 0.47733253337519643, + "grad_norm": 7.346706390380859, + "learning_rate": 4.2044457777080056e-05, + "loss": 2.9498, + "step": 1535500 + }, + { + "epoch": 0.4774879656556833, + "grad_norm": 7.319769382476807, + "learning_rate": 4.204186723907195e-05, + "loss": 3.0273, + "step": 1536000 + }, + { + "epoch": 0.47764339793617017, + "grad_norm": 6.74315881729126, + "learning_rate": 4.203927670106384e-05, + "loss": 2.9743, + "step": 1536500 + }, + { + "epoch": 0.47779883021665703, + "grad_norm": 7.575490474700928, + "learning_rate": 4.203668616305572e-05, + "loss": 3.0066, + "step": 1537000 + }, + { + "epoch": 0.47795426249714396, + "grad_norm": 9.071125984191895, + "learning_rate": 4.2034095625047605e-05, + "loss": 3.0302, + "step": 1537500 + }, + { + "epoch": 0.4781096947776308, + "grad_norm": 9.386677742004395, + "learning_rate": 4.203150508703949e-05, + "loss": 3.0183, + "step": 1538000 + }, + { + "epoch": 0.4782651270581177, + "grad_norm": 9.603699684143066, + "learning_rate": 4.202891454903137e-05, + "loss": 2.9835, + "step": 1538500 + }, + { + "epoch": 0.47842055933860456, + "grad_norm": 9.841085433959961, + "learning_rate": 4.202632401102326e-05, + "loss": 3.0131, + "step": 1539000 + }, + { + "epoch": 0.4785759916190914, + "grad_norm": 8.599462509155273, + "learning_rate": 4.202373347301514e-05, + "loss": 2.9973, + "step": 1539500 + }, + { + "epoch": 0.4787314238995783, + "grad_norm": 26.01806640625, + "learning_rate": 4.202114293500703e-05, + "loss": 3.0409, + "step": 1540000 + }, + { + "epoch": 0.4788868561800652, + "grad_norm": 8.528807640075684, + "learning_rate": 4.2018552396998914e-05, + "loss": 2.9989, + "step": 1540500 + }, + { + "epoch": 0.4790422884605521, + "grad_norm": 8.744241714477539, + "learning_rate": 4.20159618589908e-05, + "loss": 2.9809, + "step": 1541000 + }, + { + "epoch": 0.47919772074103895, + "grad_norm": 11.238310813903809, + "learning_rate": 4.201337132098269e-05, + "loss": 3.0022, + "step": 1541500 + }, + { + "epoch": 0.4793531530215258, + "grad_norm": 11.178656578063965, + "learning_rate": 4.2010780782974576e-05, + "loss": 2.9253, + "step": 1542000 + }, + { + "epoch": 0.4795085853020127, + "grad_norm": 8.03414535522461, + "learning_rate": 4.2008190244966456e-05, + "loss": 2.9525, + "step": 1542500 + }, + { + "epoch": 0.47966401758249955, + "grad_norm": 9.336945533752441, + "learning_rate": 4.2005599706958343e-05, + "loss": 2.9732, + "step": 1543000 + }, + { + "epoch": 0.47981944986298647, + "grad_norm": 7.4063029289245605, + "learning_rate": 4.200300916895023e-05, + "loss": 2.9834, + "step": 1543500 + }, + { + "epoch": 0.47997488214347334, + "grad_norm": 11.733283042907715, + "learning_rate": 4.200041863094211e-05, + "loss": 2.9885, + "step": 1544000 + }, + { + "epoch": 0.4801303144239602, + "grad_norm": 7.197287559509277, + "learning_rate": 4.1997828092934e-05, + "loss": 3.0195, + "step": 1544500 + }, + { + "epoch": 0.48028574670444707, + "grad_norm": 7.52111291885376, + "learning_rate": 4.199523755492588e-05, + "loss": 3.0109, + "step": 1545000 + }, + { + "epoch": 0.48044117898493394, + "grad_norm": 7.842710018157959, + "learning_rate": 4.1992647016917766e-05, + "loss": 2.9755, + "step": 1545500 + }, + { + "epoch": 0.4805966112654208, + "grad_norm": 8.829298973083496, + "learning_rate": 4.199005647890966e-05, + "loss": 2.9384, + "step": 1546000 + }, + { + "epoch": 0.4807520435459077, + "grad_norm": 8.377549171447754, + "learning_rate": 4.198746594090154e-05, + "loss": 2.974, + "step": 1546500 + }, + { + "epoch": 0.4809074758263946, + "grad_norm": 10.230284690856934, + "learning_rate": 4.198487540289343e-05, + "loss": 2.9827, + "step": 1547000 + }, + { + "epoch": 0.48106290810688146, + "grad_norm": 11.365165710449219, + "learning_rate": 4.1982284864885314e-05, + "loss": 3.0151, + "step": 1547500 + }, + { + "epoch": 0.4812183403873683, + "grad_norm": 18.65619659423828, + "learning_rate": 4.1979694326877195e-05, + "loss": 2.9846, + "step": 1548000 + }, + { + "epoch": 0.4813737726678552, + "grad_norm": 5.89503812789917, + "learning_rate": 4.197710378886908e-05, + "loss": 3.0398, + "step": 1548500 + }, + { + "epoch": 0.48152920494834206, + "grad_norm": 5.864975929260254, + "learning_rate": 4.197451325086097e-05, + "loss": 3.0441, + "step": 1549000 + }, + { + "epoch": 0.481684637228829, + "grad_norm": 6.989206790924072, + "learning_rate": 4.197192271285285e-05, + "loss": 3.0348, + "step": 1549500 + }, + { + "epoch": 0.48184006950931585, + "grad_norm": 9.514277458190918, + "learning_rate": 4.1969332174844736e-05, + "loss": 2.9973, + "step": 1550000 + }, + { + "epoch": 0.4819955017898027, + "grad_norm": 8.954666137695312, + "learning_rate": 4.1966741636836624e-05, + "loss": 2.9588, + "step": 1550500 + }, + { + "epoch": 0.4821509340702896, + "grad_norm": 7.557076930999756, + "learning_rate": 4.196415109882851e-05, + "loss": 2.9558, + "step": 1551000 + }, + { + "epoch": 0.48230636635077645, + "grad_norm": 7.915867805480957, + "learning_rate": 4.19615605608204e-05, + "loss": 3.0267, + "step": 1551500 + }, + { + "epoch": 0.4824617986312633, + "grad_norm": 21.85307502746582, + "learning_rate": 4.195897002281228e-05, + "loss": 2.9959, + "step": 1552000 + }, + { + "epoch": 0.48261723091175024, + "grad_norm": 9.787275314331055, + "learning_rate": 4.1956379484804165e-05, + "loss": 2.9949, + "step": 1552500 + }, + { + "epoch": 0.4827726631922371, + "grad_norm": 8.12041187286377, + "learning_rate": 4.195378894679605e-05, + "loss": 2.9735, + "step": 1553000 + }, + { + "epoch": 0.482928095472724, + "grad_norm": 9.800911903381348, + "learning_rate": 4.195119840878793e-05, + "loss": 3.0048, + "step": 1553500 + }, + { + "epoch": 0.48308352775321084, + "grad_norm": 7.510930061340332, + "learning_rate": 4.194860787077982e-05, + "loss": 2.9669, + "step": 1554000 + }, + { + "epoch": 0.4832389600336977, + "grad_norm": 7.146409034729004, + "learning_rate": 4.194601733277171e-05, + "loss": 2.9965, + "step": 1554500 + }, + { + "epoch": 0.4833943923141846, + "grad_norm": 8.304842948913574, + "learning_rate": 4.194342679476359e-05, + "loss": 3.0514, + "step": 1555000 + }, + { + "epoch": 0.4835498245946715, + "grad_norm": 10.014471054077148, + "learning_rate": 4.1940836256755475e-05, + "loss": 3.0182, + "step": 1555500 + }, + { + "epoch": 0.48370525687515836, + "grad_norm": 7.876931190490723, + "learning_rate": 4.193824571874737e-05, + "loss": 3.046, + "step": 1556000 + }, + { + "epoch": 0.48386068915564523, + "grad_norm": 9.034994125366211, + "learning_rate": 4.193565518073925e-05, + "loss": 3.0283, + "step": 1556500 + }, + { + "epoch": 0.4840161214361321, + "grad_norm": 5.44843864440918, + "learning_rate": 4.1933064642731136e-05, + "loss": 2.9818, + "step": 1557000 + }, + { + "epoch": 0.48417155371661896, + "grad_norm": 6.6321587562561035, + "learning_rate": 4.193047410472302e-05, + "loss": 2.9926, + "step": 1557500 + }, + { + "epoch": 0.48432698599710583, + "grad_norm": 6.311402320861816, + "learning_rate": 4.1927883566714904e-05, + "loss": 2.9767, + "step": 1558000 + }, + { + "epoch": 0.48448241827759275, + "grad_norm": 10.252067565917969, + "learning_rate": 4.192529302870679e-05, + "loss": 2.9837, + "step": 1558500 + }, + { + "epoch": 0.4846378505580796, + "grad_norm": 11.060343742370605, + "learning_rate": 4.192270249069867e-05, + "loss": 2.9807, + "step": 1559000 + }, + { + "epoch": 0.4847932828385665, + "grad_norm": 7.09246826171875, + "learning_rate": 4.192011195269056e-05, + "loss": 2.9967, + "step": 1559500 + }, + { + "epoch": 0.48494871511905335, + "grad_norm": 8.438859939575195, + "learning_rate": 4.1917521414682446e-05, + "loss": 3.0092, + "step": 1560000 + }, + { + "epoch": 0.4851041473995402, + "grad_norm": 7.668367385864258, + "learning_rate": 4.191493087667433e-05, + "loss": 3.0424, + "step": 1560500 + }, + { + "epoch": 0.4852595796800271, + "grad_norm": 13.192671775817871, + "learning_rate": 4.191234033866622e-05, + "loss": 2.9908, + "step": 1561000 + }, + { + "epoch": 0.485415011960514, + "grad_norm": 7.591129302978516, + "learning_rate": 4.190974980065811e-05, + "loss": 2.979, + "step": 1561500 + }, + { + "epoch": 0.4855704442410009, + "grad_norm": 6.8689188957214355, + "learning_rate": 4.190715926264999e-05, + "loss": 2.995, + "step": 1562000 + }, + { + "epoch": 0.48572587652148774, + "grad_norm": 18.441471099853516, + "learning_rate": 4.1904568724641875e-05, + "loss": 3.0072, + "step": 1562500 + }, + { + "epoch": 0.4858813088019746, + "grad_norm": 6.513213157653809, + "learning_rate": 4.1901978186633755e-05, + "loss": 2.9581, + "step": 1563000 + }, + { + "epoch": 0.4860367410824615, + "grad_norm": 8.606770515441895, + "learning_rate": 4.189938764862564e-05, + "loss": 3.0263, + "step": 1563500 + }, + { + "epoch": 0.48619217336294834, + "grad_norm": 7.711529731750488, + "learning_rate": 4.189679711061753e-05, + "loss": 2.9869, + "step": 1564000 + }, + { + "epoch": 0.48634760564343527, + "grad_norm": 9.057280540466309, + "learning_rate": 4.189420657260941e-05, + "loss": 3.0445, + "step": 1564500 + }, + { + "epoch": 0.48650303792392213, + "grad_norm": 6.258998394012451, + "learning_rate": 4.18916160346013e-05, + "loss": 2.9948, + "step": 1565000 + }, + { + "epoch": 0.486658470204409, + "grad_norm": 7.456698417663574, + "learning_rate": 4.1889025496593184e-05, + "loss": 2.9828, + "step": 1565500 + }, + { + "epoch": 0.48681390248489587, + "grad_norm": 9.373128890991211, + "learning_rate": 4.188643495858507e-05, + "loss": 2.9931, + "step": 1566000 + }, + { + "epoch": 0.48696933476538273, + "grad_norm": 9.307513236999512, + "learning_rate": 4.188384442057696e-05, + "loss": 3.0258, + "step": 1566500 + }, + { + "epoch": 0.4871247670458696, + "grad_norm": 8.579151153564453, + "learning_rate": 4.1881253882568845e-05, + "loss": 2.9686, + "step": 1567000 + }, + { + "epoch": 0.4872801993263565, + "grad_norm": 8.502559661865234, + "learning_rate": 4.1878663344560726e-05, + "loss": 3.0333, + "step": 1567500 + }, + { + "epoch": 0.4874356316068434, + "grad_norm": 7.7345051765441895, + "learning_rate": 4.187607280655261e-05, + "loss": 3.0253, + "step": 1568000 + }, + { + "epoch": 0.48759106388733026, + "grad_norm": 7.930747985839844, + "learning_rate": 4.18734822685445e-05, + "loss": 3.0192, + "step": 1568500 + }, + { + "epoch": 0.4877464961678171, + "grad_norm": 10.383498191833496, + "learning_rate": 4.187089173053638e-05, + "loss": 2.9607, + "step": 1569000 + }, + { + "epoch": 0.487901928448304, + "grad_norm": 9.311981201171875, + "learning_rate": 4.186830119252827e-05, + "loss": 2.9848, + "step": 1569500 + }, + { + "epoch": 0.48805736072879086, + "grad_norm": 9.20810604095459, + "learning_rate": 4.1865710654520155e-05, + "loss": 3.0017, + "step": 1570000 + }, + { + "epoch": 0.4882127930092778, + "grad_norm": 9.45262336730957, + "learning_rate": 4.186312011651204e-05, + "loss": 3.0256, + "step": 1570500 + }, + { + "epoch": 0.48836822528976465, + "grad_norm": 7.561440944671631, + "learning_rate": 4.186052957850393e-05, + "loss": 3.0355, + "step": 1571000 + }, + { + "epoch": 0.4885236575702515, + "grad_norm": 11.878747940063477, + "learning_rate": 4.185793904049581e-05, + "loss": 2.9895, + "step": 1571500 + }, + { + "epoch": 0.4886790898507384, + "grad_norm": 7.327657222747803, + "learning_rate": 4.18553485024877e-05, + "loss": 2.9356, + "step": 1572000 + }, + { + "epoch": 0.48883452213122525, + "grad_norm": 8.651468276977539, + "learning_rate": 4.1852757964479584e-05, + "loss": 3.0282, + "step": 1572500 + }, + { + "epoch": 0.4889899544117121, + "grad_norm": 5.334338188171387, + "learning_rate": 4.1850167426471464e-05, + "loss": 2.9903, + "step": 1573000 + }, + { + "epoch": 0.48914538669219904, + "grad_norm": 9.150879859924316, + "learning_rate": 4.184757688846335e-05, + "loss": 2.9848, + "step": 1573500 + }, + { + "epoch": 0.4893008189726859, + "grad_norm": 8.757585525512695, + "learning_rate": 4.184498635045524e-05, + "loss": 2.9458, + "step": 1574000 + }, + { + "epoch": 0.48945625125317277, + "grad_norm": 9.970255851745605, + "learning_rate": 4.184239581244712e-05, + "loss": 2.9469, + "step": 1574500 + }, + { + "epoch": 0.48961168353365964, + "grad_norm": 7.490023136138916, + "learning_rate": 4.1839805274439006e-05, + "loss": 2.9906, + "step": 1575000 + }, + { + "epoch": 0.4897671158141465, + "grad_norm": 9.169057846069336, + "learning_rate": 4.183721473643089e-05, + "loss": 2.97, + "step": 1575500 + }, + { + "epoch": 0.48992254809463337, + "grad_norm": 9.130858421325684, + "learning_rate": 4.183462419842278e-05, + "loss": 2.9934, + "step": 1576000 + }, + { + "epoch": 0.4900779803751203, + "grad_norm": 9.604486465454102, + "learning_rate": 4.183203366041467e-05, + "loss": 2.9988, + "step": 1576500 + }, + { + "epoch": 0.49023341265560716, + "grad_norm": 13.83260440826416, + "learning_rate": 4.182944312240655e-05, + "loss": 2.9862, + "step": 1577000 + }, + { + "epoch": 0.490388844936094, + "grad_norm": 7.597769260406494, + "learning_rate": 4.1826852584398435e-05, + "loss": 2.9944, + "step": 1577500 + }, + { + "epoch": 0.4905442772165809, + "grad_norm": 30.365854263305664, + "learning_rate": 4.182426204639032e-05, + "loss": 3.0351, + "step": 1578000 + }, + { + "epoch": 0.49069970949706776, + "grad_norm": 9.998760223388672, + "learning_rate": 4.18216715083822e-05, + "loss": 3.0149, + "step": 1578500 + }, + { + "epoch": 0.4908551417775546, + "grad_norm": 10.259760856628418, + "learning_rate": 4.181908097037409e-05, + "loss": 2.9833, + "step": 1579000 + }, + { + "epoch": 0.49101057405804155, + "grad_norm": 9.25013542175293, + "learning_rate": 4.181649043236598e-05, + "loss": 2.9899, + "step": 1579500 + }, + { + "epoch": 0.4911660063385284, + "grad_norm": 7.979147911071777, + "learning_rate": 4.1813899894357864e-05, + "loss": 2.9632, + "step": 1580000 + }, + { + "epoch": 0.4913214386190153, + "grad_norm": 8.876039505004883, + "learning_rate": 4.181130935634975e-05, + "loss": 3.0059, + "step": 1580500 + }, + { + "epoch": 0.49147687089950215, + "grad_norm": 7.212477684020996, + "learning_rate": 4.180871881834163e-05, + "loss": 3.0162, + "step": 1581000 + }, + { + "epoch": 0.491632303179989, + "grad_norm": 7.716450214385986, + "learning_rate": 4.180612828033352e-05, + "loss": 2.977, + "step": 1581500 + }, + { + "epoch": 0.4917877354604759, + "grad_norm": 6.326106071472168, + "learning_rate": 4.1803537742325406e-05, + "loss": 3.0056, + "step": 1582000 + }, + { + "epoch": 0.4919431677409628, + "grad_norm": 7.3688883781433105, + "learning_rate": 4.1800947204317286e-05, + "loss": 2.9385, + "step": 1582500 + }, + { + "epoch": 0.4920986000214497, + "grad_norm": 15.498443603515625, + "learning_rate": 4.1798356666309173e-05, + "loss": 3.0029, + "step": 1583000 + }, + { + "epoch": 0.49225403230193654, + "grad_norm": 32.21231460571289, + "learning_rate": 4.179576612830106e-05, + "loss": 3.0296, + "step": 1583500 + }, + { + "epoch": 0.4924094645824234, + "grad_norm": 6.667564868927002, + "learning_rate": 4.179317559029294e-05, + "loss": 2.9562, + "step": 1584000 + }, + { + "epoch": 0.4925648968629103, + "grad_norm": 7.727179527282715, + "learning_rate": 4.179058505228483e-05, + "loss": 2.9906, + "step": 1584500 + }, + { + "epoch": 0.49272032914339714, + "grad_norm": 10.33765983581543, + "learning_rate": 4.1787994514276715e-05, + "loss": 2.9643, + "step": 1585000 + }, + { + "epoch": 0.49287576142388406, + "grad_norm": 10.334589004516602, + "learning_rate": 4.17854039762686e-05, + "loss": 3.0131, + "step": 1585500 + }, + { + "epoch": 0.49303119370437093, + "grad_norm": 23.153295516967773, + "learning_rate": 4.178281343826049e-05, + "loss": 2.969, + "step": 1586000 + }, + { + "epoch": 0.4931866259848578, + "grad_norm": 10.3615083694458, + "learning_rate": 4.178022290025238e-05, + "loss": 3.0256, + "step": 1586500 + }, + { + "epoch": 0.49334205826534466, + "grad_norm": 7.920697212219238, + "learning_rate": 4.177763236224426e-05, + "loss": 2.9915, + "step": 1587000 + }, + { + "epoch": 0.49349749054583153, + "grad_norm": 7.858769416809082, + "learning_rate": 4.1775041824236144e-05, + "loss": 3.0059, + "step": 1587500 + }, + { + "epoch": 0.4936529228263184, + "grad_norm": 10.092107772827148, + "learning_rate": 4.1772451286228025e-05, + "loss": 2.9998, + "step": 1588000 + }, + { + "epoch": 0.4938083551068053, + "grad_norm": 10.020526885986328, + "learning_rate": 4.176986074821991e-05, + "loss": 2.9867, + "step": 1588500 + }, + { + "epoch": 0.4939637873872922, + "grad_norm": 8.114276885986328, + "learning_rate": 4.17672702102118e-05, + "loss": 2.9662, + "step": 1589000 + }, + { + "epoch": 0.49411921966777905, + "grad_norm": 9.188462257385254, + "learning_rate": 4.176467967220368e-05, + "loss": 2.9867, + "step": 1589500 + }, + { + "epoch": 0.4942746519482659, + "grad_norm": 10.056182861328125, + "learning_rate": 4.176208913419557e-05, + "loss": 3.0256, + "step": 1590000 + }, + { + "epoch": 0.4944300842287528, + "grad_norm": 19.037912368774414, + "learning_rate": 4.175949859618746e-05, + "loss": 2.9434, + "step": 1590500 + }, + { + "epoch": 0.49458551650923965, + "grad_norm": 8.740572929382324, + "learning_rate": 4.175690805817934e-05, + "loss": 2.9663, + "step": 1591000 + }, + { + "epoch": 0.4947409487897266, + "grad_norm": 8.26159381866455, + "learning_rate": 4.175431752017123e-05, + "loss": 2.9973, + "step": 1591500 + }, + { + "epoch": 0.49489638107021344, + "grad_norm": 12.182031631469727, + "learning_rate": 4.1751726982163115e-05, + "loss": 2.9486, + "step": 1592000 + }, + { + "epoch": 0.4950518133507003, + "grad_norm": 5.427419662475586, + "learning_rate": 4.1749136444154995e-05, + "loss": 2.938, + "step": 1592500 + }, + { + "epoch": 0.4952072456311872, + "grad_norm": 8.057446479797363, + "learning_rate": 4.174654590614688e-05, + "loss": 2.9944, + "step": 1593000 + }, + { + "epoch": 0.49536267791167404, + "grad_norm": 7.7029500007629395, + "learning_rate": 4.174395536813876e-05, + "loss": 2.9759, + "step": 1593500 + }, + { + "epoch": 0.4955181101921609, + "grad_norm": 7.9611616134643555, + "learning_rate": 4.174136483013065e-05, + "loss": 3.0118, + "step": 1594000 + }, + { + "epoch": 0.4956735424726478, + "grad_norm": 8.286368370056152, + "learning_rate": 4.173877429212254e-05, + "loss": 2.9935, + "step": 1594500 + }, + { + "epoch": 0.4958289747531347, + "grad_norm": 9.888763427734375, + "learning_rate": 4.1736183754114424e-05, + "loss": 3.0062, + "step": 1595000 + }, + { + "epoch": 0.49598440703362157, + "grad_norm": 9.872170448303223, + "learning_rate": 4.173359321610631e-05, + "loss": 3.0208, + "step": 1595500 + }, + { + "epoch": 0.49613983931410843, + "grad_norm": 11.583695411682129, + "learning_rate": 4.17310026780982e-05, + "loss": 2.9856, + "step": 1596000 + }, + { + "epoch": 0.4962952715945953, + "grad_norm": 7.401259899139404, + "learning_rate": 4.172841214009008e-05, + "loss": 2.9541, + "step": 1596500 + }, + { + "epoch": 0.49645070387508217, + "grad_norm": 6.840715408325195, + "learning_rate": 4.1725821602081966e-05, + "loss": 2.9825, + "step": 1597000 + }, + { + "epoch": 0.49660613615556903, + "grad_norm": 13.321316719055176, + "learning_rate": 4.1723231064073853e-05, + "loss": 2.9844, + "step": 1597500 + }, + { + "epoch": 0.49676156843605596, + "grad_norm": 8.818543434143066, + "learning_rate": 4.1720640526065734e-05, + "loss": 2.945, + "step": 1598000 + }, + { + "epoch": 0.4969170007165428, + "grad_norm": 11.288419723510742, + "learning_rate": 4.171804998805762e-05, + "loss": 3.0011, + "step": 1598500 + }, + { + "epoch": 0.4970724329970297, + "grad_norm": 8.1756591796875, + "learning_rate": 4.171545945004951e-05, + "loss": 3.024, + "step": 1599000 + }, + { + "epoch": 0.49722786527751656, + "grad_norm": 7.476779460906982, + "learning_rate": 4.171286891204139e-05, + "loss": 2.9931, + "step": 1599500 + }, + { + "epoch": 0.4973832975580034, + "grad_norm": 14.615345001220703, + "learning_rate": 4.171027837403328e-05, + "loss": 2.9629, + "step": 1600000 + }, + { + "epoch": 0.4975387298384903, + "grad_norm": 14.677370071411133, + "learning_rate": 4.170768783602516e-05, + "loss": 2.9282, + "step": 1600500 + }, + { + "epoch": 0.4976941621189772, + "grad_norm": 6.205915927886963, + "learning_rate": 4.170509729801705e-05, + "loss": 2.9924, + "step": 1601000 + }, + { + "epoch": 0.4978495943994641, + "grad_norm": 7.97024393081665, + "learning_rate": 4.170250676000894e-05, + "loss": 3.0059, + "step": 1601500 + }, + { + "epoch": 0.49800502667995095, + "grad_norm": 12.274286270141602, + "learning_rate": 4.169991622200082e-05, + "loss": 3.0088, + "step": 1602000 + }, + { + "epoch": 0.4981604589604378, + "grad_norm": 8.152814865112305, + "learning_rate": 4.1697325683992705e-05, + "loss": 3.0261, + "step": 1602500 + }, + { + "epoch": 0.4983158912409247, + "grad_norm": 10.35328483581543, + "learning_rate": 4.169473514598459e-05, + "loss": 3.0108, + "step": 1603000 + }, + { + "epoch": 0.49847132352141155, + "grad_norm": 6.56851863861084, + "learning_rate": 4.169214460797647e-05, + "loss": 3.044, + "step": 1603500 + }, + { + "epoch": 0.49862675580189847, + "grad_norm": 8.182966232299805, + "learning_rate": 4.168955406996836e-05, + "loss": 2.9957, + "step": 1604000 + }, + { + "epoch": 0.49878218808238534, + "grad_norm": 9.19281005859375, + "learning_rate": 4.1686963531960246e-05, + "loss": 2.9845, + "step": 1604500 + }, + { + "epoch": 0.4989376203628722, + "grad_norm": 7.337644577026367, + "learning_rate": 4.1684372993952134e-05, + "loss": 2.9839, + "step": 1605000 + }, + { + "epoch": 0.49909305264335907, + "grad_norm": 6.611823081970215, + "learning_rate": 4.168178245594402e-05, + "loss": 3.0047, + "step": 1605500 + }, + { + "epoch": 0.49924848492384594, + "grad_norm": 8.948554992675781, + "learning_rate": 4.16791919179359e-05, + "loss": 3.0124, + "step": 1606000 + }, + { + "epoch": 0.4994039172043328, + "grad_norm": 13.644035339355469, + "learning_rate": 4.167660137992779e-05, + "loss": 3.0011, + "step": 1606500 + }, + { + "epoch": 0.4995593494848197, + "grad_norm": 8.744574546813965, + "learning_rate": 4.1674010841919675e-05, + "loss": 2.9454, + "step": 1607000 + }, + { + "epoch": 0.4997147817653066, + "grad_norm": 8.299756050109863, + "learning_rate": 4.1671420303911556e-05, + "loss": 3.0119, + "step": 1607500 + }, + { + "epoch": 0.49987021404579346, + "grad_norm": 6.491403579711914, + "learning_rate": 4.166882976590344e-05, + "loss": 3.0064, + "step": 1608000 + }, + { + "epoch": 0.5000256463262803, + "grad_norm": 10.078863143920898, + "learning_rate": 4.166623922789533e-05, + "loss": 2.9867, + "step": 1608500 + }, + { + "epoch": 0.5001810786067672, + "grad_norm": 8.548155784606934, + "learning_rate": 4.166364868988721e-05, + "loss": 2.9391, + "step": 1609000 + }, + { + "epoch": 0.5003365108872541, + "grad_norm": 12.162303924560547, + "learning_rate": 4.16610581518791e-05, + "loss": 2.983, + "step": 1609500 + }, + { + "epoch": 0.5004919431677409, + "grad_norm": 11.335734367370605, + "learning_rate": 4.165846761387099e-05, + "loss": 2.9882, + "step": 1610000 + }, + { + "epoch": 0.5006473754482278, + "grad_norm": 17.077932357788086, + "learning_rate": 4.165587707586287e-05, + "loss": 2.9607, + "step": 1610500 + }, + { + "epoch": 0.5008028077287147, + "grad_norm": 8.846359252929688, + "learning_rate": 4.165328653785476e-05, + "loss": 3.0259, + "step": 1611000 + }, + { + "epoch": 0.5009582400092016, + "grad_norm": 10.704377174377441, + "learning_rate": 4.165069599984664e-05, + "loss": 3.0112, + "step": 1611500 + }, + { + "epoch": 0.5011136722896885, + "grad_norm": 14.57355785369873, + "learning_rate": 4.164810546183853e-05, + "loss": 2.998, + "step": 1612000 + }, + { + "epoch": 0.5012691045701754, + "grad_norm": 8.471502304077148, + "learning_rate": 4.1645514923830414e-05, + "loss": 2.9912, + "step": 1612500 + }, + { + "epoch": 0.5014245368506622, + "grad_norm": 9.317343711853027, + "learning_rate": 4.1642924385822294e-05, + "loss": 3.0324, + "step": 1613000 + }, + { + "epoch": 0.5015799691311491, + "grad_norm": 12.618123054504395, + "learning_rate": 4.164033384781418e-05, + "loss": 2.9936, + "step": 1613500 + }, + { + "epoch": 0.501735401411636, + "grad_norm": 10.003442764282227, + "learning_rate": 4.163774330980607e-05, + "loss": 3.0338, + "step": 1614000 + }, + { + "epoch": 0.5018908336921228, + "grad_norm": 8.678059577941895, + "learning_rate": 4.1635152771797956e-05, + "loss": 2.998, + "step": 1614500 + }, + { + "epoch": 0.5020462659726097, + "grad_norm": 7.122654914855957, + "learning_rate": 4.163256223378984e-05, + "loss": 2.981, + "step": 1615000 + }, + { + "epoch": 0.5022016982530966, + "grad_norm": 8.33456039428711, + "learning_rate": 4.162997169578173e-05, + "loss": 3.053, + "step": 1615500 + }, + { + "epoch": 0.5023571305335834, + "grad_norm": 8.46042251586914, + "learning_rate": 4.162738115777361e-05, + "loss": 3.026, + "step": 1616000 + }, + { + "epoch": 0.5025125628140703, + "grad_norm": 5.645463466644287, + "learning_rate": 4.16247906197655e-05, + "loss": 2.9797, + "step": 1616500 + }, + { + "epoch": 0.5026679950945572, + "grad_norm": 8.48398494720459, + "learning_rate": 4.1622200081757385e-05, + "loss": 2.9984, + "step": 1617000 + }, + { + "epoch": 0.5028234273750442, + "grad_norm": 23.01471519470215, + "learning_rate": 4.1619609543749265e-05, + "loss": 2.9768, + "step": 1617500 + }, + { + "epoch": 0.502978859655531, + "grad_norm": 8.315071105957031, + "learning_rate": 4.161701900574115e-05, + "loss": 2.9634, + "step": 1618000 + }, + { + "epoch": 0.5031342919360179, + "grad_norm": 23.66253089904785, + "learning_rate": 4.161442846773303e-05, + "loss": 3.0412, + "step": 1618500 + }, + { + "epoch": 0.5032897242165048, + "grad_norm": 8.918397903442383, + "learning_rate": 4.161183792972492e-05, + "loss": 3.0196, + "step": 1619000 + }, + { + "epoch": 0.5034451564969916, + "grad_norm": 10.410909652709961, + "learning_rate": 4.160924739171681e-05, + "loss": 2.9727, + "step": 1619500 + }, + { + "epoch": 0.5036005887774785, + "grad_norm": 8.234715461730957, + "learning_rate": 4.1606656853708694e-05, + "loss": 2.9518, + "step": 1620000 + }, + { + "epoch": 0.5037560210579654, + "grad_norm": 7.849624156951904, + "learning_rate": 4.160406631570058e-05, + "loss": 2.9787, + "step": 1620500 + }, + { + "epoch": 0.5039114533384522, + "grad_norm": 9.417266845703125, + "learning_rate": 4.160147577769247e-05, + "loss": 2.9811, + "step": 1621000 + }, + { + "epoch": 0.5040668856189391, + "grad_norm": 6.076145648956299, + "learning_rate": 4.159888523968435e-05, + "loss": 2.9939, + "step": 1621500 + }, + { + "epoch": 0.504222317899426, + "grad_norm": 29.80683135986328, + "learning_rate": 4.1596294701676236e-05, + "loss": 2.9764, + "step": 1622000 + }, + { + "epoch": 0.5043777501799128, + "grad_norm": 7.977956771850586, + "learning_rate": 4.159370416366812e-05, + "loss": 3.017, + "step": 1622500 + }, + { + "epoch": 0.5045331824603997, + "grad_norm": 8.633633613586426, + "learning_rate": 4.159111362566e-05, + "loss": 2.9681, + "step": 1623000 + }, + { + "epoch": 0.5046886147408867, + "grad_norm": 7.865110874176025, + "learning_rate": 4.158852308765189e-05, + "loss": 3.0178, + "step": 1623500 + }, + { + "epoch": 0.5048440470213735, + "grad_norm": 9.483219146728516, + "learning_rate": 4.158593254964378e-05, + "loss": 2.9539, + "step": 1624000 + }, + { + "epoch": 0.5049994793018604, + "grad_norm": 9.61340618133545, + "learning_rate": 4.1583342011635665e-05, + "loss": 2.9802, + "step": 1624500 + }, + { + "epoch": 0.5051549115823473, + "grad_norm": 6.565794467926025, + "learning_rate": 4.158075147362755e-05, + "loss": 2.9688, + "step": 1625000 + }, + { + "epoch": 0.5053103438628341, + "grad_norm": 8.747660636901855, + "learning_rate": 4.157816093561943e-05, + "loss": 2.9356, + "step": 1625500 + }, + { + "epoch": 0.505465776143321, + "grad_norm": 13.495156288146973, + "learning_rate": 4.157557039761132e-05, + "loss": 3.0118, + "step": 1626000 + }, + { + "epoch": 0.5056212084238079, + "grad_norm": 27.394878387451172, + "learning_rate": 4.157297985960321e-05, + "loss": 2.9854, + "step": 1626500 + }, + { + "epoch": 0.5057766407042947, + "grad_norm": 22.356338500976562, + "learning_rate": 4.157038932159509e-05, + "loss": 2.9816, + "step": 1627000 + }, + { + "epoch": 0.5059320729847816, + "grad_norm": 9.313362121582031, + "learning_rate": 4.1567798783586974e-05, + "loss": 2.9602, + "step": 1627500 + }, + { + "epoch": 0.5060875052652685, + "grad_norm": 8.605875015258789, + "learning_rate": 4.156520824557886e-05, + "loss": 2.9627, + "step": 1628000 + }, + { + "epoch": 0.5062429375457553, + "grad_norm": 7.183097839355469, + "learning_rate": 4.156261770757074e-05, + "loss": 3.0069, + "step": 1628500 + }, + { + "epoch": 0.5063983698262422, + "grad_norm": 9.36975383758545, + "learning_rate": 4.156002716956263e-05, + "loss": 2.9933, + "step": 1629000 + }, + { + "epoch": 0.5065538021067292, + "grad_norm": 7.84438943862915, + "learning_rate": 4.1557436631554516e-05, + "loss": 3.0366, + "step": 1629500 + }, + { + "epoch": 0.506709234387216, + "grad_norm": 7.437487602233887, + "learning_rate": 4.15548460935464e-05, + "loss": 3.0119, + "step": 1630000 + }, + { + "epoch": 0.5068646666677029, + "grad_norm": 9.733963966369629, + "learning_rate": 4.155225555553829e-05, + "loss": 3.0043, + "step": 1630500 + }, + { + "epoch": 0.5070200989481898, + "grad_norm": 7.822879791259766, + "learning_rate": 4.154966501753017e-05, + "loss": 2.9603, + "step": 1631000 + }, + { + "epoch": 0.5071755312286766, + "grad_norm": 15.17553997039795, + "learning_rate": 4.154707447952206e-05, + "loss": 3.004, + "step": 1631500 + }, + { + "epoch": 0.5073309635091635, + "grad_norm": 9.292868614196777, + "learning_rate": 4.1544483941513945e-05, + "loss": 2.988, + "step": 1632000 + }, + { + "epoch": 0.5074863957896504, + "grad_norm": 7.613121032714844, + "learning_rate": 4.1541893403505825e-05, + "loss": 2.9542, + "step": 1632500 + }, + { + "epoch": 0.5076418280701372, + "grad_norm": 16.71627426147461, + "learning_rate": 4.153930286549771e-05, + "loss": 3.0347, + "step": 1633000 + }, + { + "epoch": 0.5077972603506241, + "grad_norm": 7.885018348693848, + "learning_rate": 4.15367123274896e-05, + "loss": 2.9811, + "step": 1633500 + }, + { + "epoch": 0.507952692631111, + "grad_norm": 10.06120777130127, + "learning_rate": 4.153412178948149e-05, + "loss": 3.0141, + "step": 1634000 + }, + { + "epoch": 0.5081081249115978, + "grad_norm": 20.286537170410156, + "learning_rate": 4.1531531251473374e-05, + "loss": 2.9873, + "step": 1634500 + }, + { + "epoch": 0.5082635571920847, + "grad_norm": 8.463911056518555, + "learning_rate": 4.152894071346526e-05, + "loss": 3.0285, + "step": 1635000 + }, + { + "epoch": 0.5084189894725717, + "grad_norm": 7.06170654296875, + "learning_rate": 4.152635017545714e-05, + "loss": 2.9514, + "step": 1635500 + }, + { + "epoch": 0.5085744217530586, + "grad_norm": 8.16528606414795, + "learning_rate": 4.152375963744903e-05, + "loss": 2.9861, + "step": 1636000 + }, + { + "epoch": 0.5087298540335454, + "grad_norm": 6.922329425811768, + "learning_rate": 4.152116909944091e-05, + "loss": 2.9977, + "step": 1636500 + }, + { + "epoch": 0.5088852863140323, + "grad_norm": 8.915877342224121, + "learning_rate": 4.1518578561432796e-05, + "loss": 2.9901, + "step": 1637000 + }, + { + "epoch": 0.5090407185945192, + "grad_norm": 9.112048149108887, + "learning_rate": 4.1515988023424683e-05, + "loss": 2.9736, + "step": 1637500 + }, + { + "epoch": 0.509196150875006, + "grad_norm": 8.106758117675781, + "learning_rate": 4.1513397485416564e-05, + "loss": 3.0185, + "step": 1638000 + }, + { + "epoch": 0.5093515831554929, + "grad_norm": 7.001745700836182, + "learning_rate": 4.151080694740845e-05, + "loss": 3.0053, + "step": 1638500 + }, + { + "epoch": 0.5095070154359798, + "grad_norm": 7.316071510314941, + "learning_rate": 4.150821640940034e-05, + "loss": 3.0078, + "step": 1639000 + }, + { + "epoch": 0.5096624477164666, + "grad_norm": 9.291595458984375, + "learning_rate": 4.1505625871392225e-05, + "loss": 2.9813, + "step": 1639500 + }, + { + "epoch": 0.5098178799969535, + "grad_norm": 7.652420997619629, + "learning_rate": 4.150303533338411e-05, + "loss": 2.9945, + "step": 1640000 + }, + { + "epoch": 0.5099733122774404, + "grad_norm": 8.232155799865723, + "learning_rate": 4.1500444795376e-05, + "loss": 2.9988, + "step": 1640500 + }, + { + "epoch": 0.5101287445579272, + "grad_norm": 7.344640254974365, + "learning_rate": 4.149785425736788e-05, + "loss": 2.9892, + "step": 1641000 + }, + { + "epoch": 0.5102841768384142, + "grad_norm": 7.96024227142334, + "learning_rate": 4.149526371935977e-05, + "loss": 2.9777, + "step": 1641500 + }, + { + "epoch": 0.5104396091189011, + "grad_norm": 17.92547607421875, + "learning_rate": 4.149267318135165e-05, + "loss": 2.9684, + "step": 1642000 + }, + { + "epoch": 0.5105950413993879, + "grad_norm": 7.3407883644104, + "learning_rate": 4.1490082643343535e-05, + "loss": 2.9564, + "step": 1642500 + }, + { + "epoch": 0.5107504736798748, + "grad_norm": 7.824656009674072, + "learning_rate": 4.148749210533542e-05, + "loss": 3.005, + "step": 1643000 + }, + { + "epoch": 0.5109059059603617, + "grad_norm": 8.50614070892334, + "learning_rate": 4.148490156732731e-05, + "loss": 3.0242, + "step": 1643500 + }, + { + "epoch": 0.5110613382408485, + "grad_norm": 10.23419189453125, + "learning_rate": 4.1482311029319196e-05, + "loss": 3.0126, + "step": 1644000 + }, + { + "epoch": 0.5112167705213354, + "grad_norm": 6.7730183601379395, + "learning_rate": 4.147972049131108e-05, + "loss": 2.9534, + "step": 1644500 + }, + { + "epoch": 0.5113722028018223, + "grad_norm": 7.92793607711792, + "learning_rate": 4.1477129953302964e-05, + "loss": 2.9554, + "step": 1645000 + }, + { + "epoch": 0.5115276350823091, + "grad_norm": 6.769281387329102, + "learning_rate": 4.147453941529485e-05, + "loss": 2.978, + "step": 1645500 + }, + { + "epoch": 0.511683067362796, + "grad_norm": 7.430790901184082, + "learning_rate": 4.147194887728674e-05, + "loss": 2.981, + "step": 1646000 + }, + { + "epoch": 0.5118384996432829, + "grad_norm": 8.782576560974121, + "learning_rate": 4.146935833927862e-05, + "loss": 2.9938, + "step": 1646500 + }, + { + "epoch": 0.5119939319237697, + "grad_norm": 8.063702583312988, + "learning_rate": 4.1466767801270505e-05, + "loss": 2.9695, + "step": 1647000 + }, + { + "epoch": 0.5121493642042567, + "grad_norm": 8.773962020874023, + "learning_rate": 4.1464177263262386e-05, + "loss": 2.9318, + "step": 1647500 + }, + { + "epoch": 0.5123047964847436, + "grad_norm": 7.601384162902832, + "learning_rate": 4.146158672525427e-05, + "loss": 2.9658, + "step": 1648000 + }, + { + "epoch": 0.5124602287652305, + "grad_norm": 8.56450080871582, + "learning_rate": 4.145899618724616e-05, + "loss": 2.9677, + "step": 1648500 + }, + { + "epoch": 0.5126156610457173, + "grad_norm": 9.552669525146484, + "learning_rate": 4.145640564923805e-05, + "loss": 3.0454, + "step": 1649000 + }, + { + "epoch": 0.5127710933262042, + "grad_norm": 9.074074745178223, + "learning_rate": 4.1453815111229934e-05, + "loss": 3.0033, + "step": 1649500 + }, + { + "epoch": 0.512926525606691, + "grad_norm": 6.021200180053711, + "learning_rate": 4.145122457322182e-05, + "loss": 2.9752, + "step": 1650000 + }, + { + "epoch": 0.5130819578871779, + "grad_norm": 14.610686302185059, + "learning_rate": 4.14486340352137e-05, + "loss": 3.0192, + "step": 1650500 + }, + { + "epoch": 0.5132373901676648, + "grad_norm": 9.235920906066895, + "learning_rate": 4.144604349720559e-05, + "loss": 2.9626, + "step": 1651000 + }, + { + "epoch": 0.5133928224481517, + "grad_norm": 20.57294464111328, + "learning_rate": 4.1443452959197476e-05, + "loss": 3.0321, + "step": 1651500 + }, + { + "epoch": 0.5135482547286385, + "grad_norm": 8.491010665893555, + "learning_rate": 4.144086242118936e-05, + "loss": 2.9874, + "step": 1652000 + }, + { + "epoch": 0.5137036870091254, + "grad_norm": 7.612607002258301, + "learning_rate": 4.1438271883181244e-05, + "loss": 2.9556, + "step": 1652500 + }, + { + "epoch": 0.5138591192896123, + "grad_norm": 8.275647163391113, + "learning_rate": 4.143568134517313e-05, + "loss": 2.9591, + "step": 1653000 + }, + { + "epoch": 0.5140145515700992, + "grad_norm": 4.298964977264404, + "learning_rate": 4.143309080716502e-05, + "loss": 3.0427, + "step": 1653500 + }, + { + "epoch": 0.5141699838505861, + "grad_norm": 7.588286876678467, + "learning_rate": 4.1430500269156905e-05, + "loss": 2.9995, + "step": 1654000 + }, + { + "epoch": 0.514325416131073, + "grad_norm": 6.678065776824951, + "learning_rate": 4.1427909731148786e-05, + "loss": 3.032, + "step": 1654500 + }, + { + "epoch": 0.5144808484115598, + "grad_norm": 19.68391990661621, + "learning_rate": 4.142531919314067e-05, + "loss": 2.9973, + "step": 1655000 + }, + { + "epoch": 0.5146362806920467, + "grad_norm": 10.899819374084473, + "learning_rate": 4.142272865513256e-05, + "loss": 2.9937, + "step": 1655500 + }, + { + "epoch": 0.5147917129725336, + "grad_norm": 6.3640828132629395, + "learning_rate": 4.142013811712444e-05, + "loss": 2.9531, + "step": 1656000 + }, + { + "epoch": 0.5149471452530204, + "grad_norm": 8.831696510314941, + "learning_rate": 4.141754757911633e-05, + "loss": 2.9992, + "step": 1656500 + }, + { + "epoch": 0.5151025775335073, + "grad_norm": 7.968363285064697, + "learning_rate": 4.1414957041108215e-05, + "loss": 3.0333, + "step": 1657000 + }, + { + "epoch": 0.5152580098139942, + "grad_norm": 42.909183502197266, + "learning_rate": 4.1412366503100095e-05, + "loss": 2.9498, + "step": 1657500 + }, + { + "epoch": 0.515413442094481, + "grad_norm": 8.367121696472168, + "learning_rate": 4.140977596509198e-05, + "loss": 2.9803, + "step": 1658000 + }, + { + "epoch": 0.5155688743749679, + "grad_norm": 16.46213150024414, + "learning_rate": 4.140718542708387e-05, + "loss": 2.9388, + "step": 1658500 + }, + { + "epoch": 0.5157243066554548, + "grad_norm": 7.490367889404297, + "learning_rate": 4.1404594889075756e-05, + "loss": 2.9593, + "step": 1659000 + }, + { + "epoch": 0.5158797389359417, + "grad_norm": 8.132136344909668, + "learning_rate": 4.1402004351067644e-05, + "loss": 2.9474, + "step": 1659500 + }, + { + "epoch": 0.5160351712164286, + "grad_norm": 41.8113899230957, + "learning_rate": 4.1399413813059524e-05, + "loss": 3.0244, + "step": 1660000 + }, + { + "epoch": 0.5161906034969155, + "grad_norm": 9.22241497039795, + "learning_rate": 4.139682327505141e-05, + "loss": 2.9491, + "step": 1660500 + }, + { + "epoch": 0.5163460357774023, + "grad_norm": 8.665870666503906, + "learning_rate": 4.13942327370433e-05, + "loss": 2.9773, + "step": 1661000 + }, + { + "epoch": 0.5165014680578892, + "grad_norm": 10.71556568145752, + "learning_rate": 4.139164219903518e-05, + "loss": 2.9837, + "step": 1661500 + }, + { + "epoch": 0.5166569003383761, + "grad_norm": 9.194273948669434, + "learning_rate": 4.1389051661027066e-05, + "loss": 2.9556, + "step": 1662000 + }, + { + "epoch": 0.516812332618863, + "grad_norm": 7.8306965827941895, + "learning_rate": 4.138646112301895e-05, + "loss": 2.9654, + "step": 1662500 + }, + { + "epoch": 0.5169677648993498, + "grad_norm": 10.61097526550293, + "learning_rate": 4.138387058501083e-05, + "loss": 2.9438, + "step": 1663000 + }, + { + "epoch": 0.5171231971798367, + "grad_norm": 14.161703109741211, + "learning_rate": 4.138128004700273e-05, + "loss": 2.9649, + "step": 1663500 + }, + { + "epoch": 0.5172786294603235, + "grad_norm": 7.986491680145264, + "learning_rate": 4.1378689508994614e-05, + "loss": 3.0239, + "step": 1664000 + }, + { + "epoch": 0.5174340617408104, + "grad_norm": 7.5975542068481445, + "learning_rate": 4.1376098970986495e-05, + "loss": 3.0145, + "step": 1664500 + }, + { + "epoch": 0.5175894940212973, + "grad_norm": 10.603043556213379, + "learning_rate": 4.137350843297838e-05, + "loss": 2.9507, + "step": 1665000 + }, + { + "epoch": 0.5177449263017843, + "grad_norm": 8.934953689575195, + "learning_rate": 4.137091789497026e-05, + "loss": 2.9645, + "step": 1665500 + }, + { + "epoch": 0.5179003585822711, + "grad_norm": 6.844863414764404, + "learning_rate": 4.136832735696215e-05, + "loss": 2.9886, + "step": 1666000 + }, + { + "epoch": 0.518055790862758, + "grad_norm": 8.701683044433594, + "learning_rate": 4.136573681895404e-05, + "loss": 2.9865, + "step": 1666500 + }, + { + "epoch": 0.5182112231432449, + "grad_norm": 8.856301307678223, + "learning_rate": 4.136314628094592e-05, + "loss": 2.9864, + "step": 1667000 + }, + { + "epoch": 0.5183666554237317, + "grad_norm": 18.905710220336914, + "learning_rate": 4.1360555742937804e-05, + "loss": 2.9649, + "step": 1667500 + }, + { + "epoch": 0.5185220877042186, + "grad_norm": 8.760897636413574, + "learning_rate": 4.135796520492969e-05, + "loss": 2.9816, + "step": 1668000 + }, + { + "epoch": 0.5186775199847055, + "grad_norm": 9.566031455993652, + "learning_rate": 4.135537466692158e-05, + "loss": 2.9977, + "step": 1668500 + }, + { + "epoch": 0.5188329522651923, + "grad_norm": 8.340035438537598, + "learning_rate": 4.1352784128913466e-05, + "loss": 2.9913, + "step": 1669000 + }, + { + "epoch": 0.5189883845456792, + "grad_norm": 9.20612621307373, + "learning_rate": 4.135019359090535e-05, + "loss": 3.0521, + "step": 1669500 + }, + { + "epoch": 0.5191438168261661, + "grad_norm": 11.014350891113281, + "learning_rate": 4.134760305289723e-05, + "loss": 2.9416, + "step": 1670000 + }, + { + "epoch": 0.5192992491066529, + "grad_norm": 31.544527053833008, + "learning_rate": 4.134501251488912e-05, + "loss": 2.9896, + "step": 1670500 + }, + { + "epoch": 0.5194546813871398, + "grad_norm": 8.189542770385742, + "learning_rate": 4.134242197688101e-05, + "loss": 2.9834, + "step": 1671000 + }, + { + "epoch": 0.5196101136676268, + "grad_norm": 7.278792381286621, + "learning_rate": 4.133983143887289e-05, + "loss": 2.9516, + "step": 1671500 + }, + { + "epoch": 0.5197655459481136, + "grad_norm": 9.042860984802246, + "learning_rate": 4.1337240900864775e-05, + "loss": 2.9797, + "step": 1672000 + }, + { + "epoch": 0.5199209782286005, + "grad_norm": 9.362375259399414, + "learning_rate": 4.1334650362856655e-05, + "loss": 2.9813, + "step": 1672500 + }, + { + "epoch": 0.5200764105090874, + "grad_norm": 7.043313026428223, + "learning_rate": 4.133205982484854e-05, + "loss": 2.9797, + "step": 1673000 + }, + { + "epoch": 0.5202318427895742, + "grad_norm": 10.092191696166992, + "learning_rate": 4.1329469286840436e-05, + "loss": 2.9732, + "step": 1673500 + }, + { + "epoch": 0.5203872750700611, + "grad_norm": 8.273961067199707, + "learning_rate": 4.132687874883232e-05, + "loss": 2.9804, + "step": 1674000 + }, + { + "epoch": 0.520542707350548, + "grad_norm": 7.927803993225098, + "learning_rate": 4.1324288210824204e-05, + "loss": 2.9344, + "step": 1674500 + }, + { + "epoch": 0.5206981396310348, + "grad_norm": 10.36579704284668, + "learning_rate": 4.132169767281609e-05, + "loss": 2.9878, + "step": 1675000 + }, + { + "epoch": 0.5208535719115217, + "grad_norm": 7.637599468231201, + "learning_rate": 4.131910713480797e-05, + "loss": 2.9949, + "step": 1675500 + }, + { + "epoch": 0.5210090041920086, + "grad_norm": 9.794143676757812, + "learning_rate": 4.131651659679986e-05, + "loss": 2.9492, + "step": 1676000 + }, + { + "epoch": 0.5211644364724954, + "grad_norm": 8.725481033325195, + "learning_rate": 4.1313926058791746e-05, + "loss": 2.9399, + "step": 1676500 + }, + { + "epoch": 0.5213198687529823, + "grad_norm": 13.024754524230957, + "learning_rate": 4.1311335520783626e-05, + "loss": 2.9685, + "step": 1677000 + }, + { + "epoch": 0.5214753010334693, + "grad_norm": 8.703847885131836, + "learning_rate": 4.130874498277551e-05, + "loss": 2.9788, + "step": 1677500 + }, + { + "epoch": 0.5216307333139562, + "grad_norm": 8.46328067779541, + "learning_rate": 4.13061544447674e-05, + "loss": 3.0086, + "step": 1678000 + }, + { + "epoch": 0.521786165594443, + "grad_norm": 10.195046424865723, + "learning_rate": 4.130356390675929e-05, + "loss": 2.986, + "step": 1678500 + }, + { + "epoch": 0.5219415978749299, + "grad_norm": 6.775644779205322, + "learning_rate": 4.1300973368751175e-05, + "loss": 2.982, + "step": 1679000 + }, + { + "epoch": 0.5220970301554168, + "grad_norm": 9.619054794311523, + "learning_rate": 4.1298382830743055e-05, + "loss": 3.0174, + "step": 1679500 + }, + { + "epoch": 0.5222524624359036, + "grad_norm": 22.47823143005371, + "learning_rate": 4.129579229273494e-05, + "loss": 3.0184, + "step": 1680000 + }, + { + "epoch": 0.5224078947163905, + "grad_norm": 10.837433815002441, + "learning_rate": 4.129320175472683e-05, + "loss": 2.9688, + "step": 1680500 + }, + { + "epoch": 0.5225633269968774, + "grad_norm": 9.543180465698242, + "learning_rate": 4.129061121671871e-05, + "loss": 3.0043, + "step": 1681000 + }, + { + "epoch": 0.5227187592773642, + "grad_norm": 8.137543678283691, + "learning_rate": 4.12880206787106e-05, + "loss": 2.9984, + "step": 1681500 + }, + { + "epoch": 0.5228741915578511, + "grad_norm": 8.196511268615723, + "learning_rate": 4.1285430140702484e-05, + "loss": 2.9481, + "step": 1682000 + }, + { + "epoch": 0.523029623838338, + "grad_norm": 8.931879997253418, + "learning_rate": 4.1282839602694365e-05, + "loss": 2.9929, + "step": 1682500 + }, + { + "epoch": 0.5231850561188248, + "grad_norm": 7.392938137054443, + "learning_rate": 4.128024906468625e-05, + "loss": 3.047, + "step": 1683000 + }, + { + "epoch": 0.5233404883993118, + "grad_norm": 8.65135669708252, + "learning_rate": 4.127765852667814e-05, + "loss": 2.9493, + "step": 1683500 + }, + { + "epoch": 0.5234959206797987, + "grad_norm": 19.403303146362305, + "learning_rate": 4.1275067988670026e-05, + "loss": 2.9899, + "step": 1684000 + }, + { + "epoch": 0.5236513529602855, + "grad_norm": 10.971894264221191, + "learning_rate": 4.127247745066191e-05, + "loss": 2.9824, + "step": 1684500 + }, + { + "epoch": 0.5238067852407724, + "grad_norm": 9.60467529296875, + "learning_rate": 4.1269886912653794e-05, + "loss": 2.958, + "step": 1685000 + }, + { + "epoch": 0.5239622175212593, + "grad_norm": 8.726387977600098, + "learning_rate": 4.126729637464568e-05, + "loss": 3.0061, + "step": 1685500 + }, + { + "epoch": 0.5241176498017461, + "grad_norm": 8.451407432556152, + "learning_rate": 4.126470583663757e-05, + "loss": 3.0279, + "step": 1686000 + }, + { + "epoch": 0.524273082082233, + "grad_norm": 8.460199356079102, + "learning_rate": 4.126211529862945e-05, + "loss": 3.009, + "step": 1686500 + }, + { + "epoch": 0.5244285143627199, + "grad_norm": 10.152080535888672, + "learning_rate": 4.1259524760621335e-05, + "loss": 2.9595, + "step": 1687000 + }, + { + "epoch": 0.5245839466432067, + "grad_norm": 8.25213623046875, + "learning_rate": 4.125693422261322e-05, + "loss": 2.9238, + "step": 1687500 + }, + { + "epoch": 0.5247393789236936, + "grad_norm": 9.922224044799805, + "learning_rate": 4.125434368460511e-05, + "loss": 2.9434, + "step": 1688000 + }, + { + "epoch": 0.5248948112041805, + "grad_norm": 8.557171821594238, + "learning_rate": 4.1251753146597e-05, + "loss": 2.9741, + "step": 1688500 + }, + { + "epoch": 0.5250502434846673, + "grad_norm": 15.166727066040039, + "learning_rate": 4.1249162608588884e-05, + "loss": 2.9541, + "step": 1689000 + }, + { + "epoch": 0.5252056757651543, + "grad_norm": 10.065393447875977, + "learning_rate": 4.1246572070580764e-05, + "loss": 2.9803, + "step": 1689500 + }, + { + "epoch": 0.5253611080456412, + "grad_norm": 9.149473190307617, + "learning_rate": 4.124398153257265e-05, + "loss": 3.0123, + "step": 1690000 + }, + { + "epoch": 0.525516540326128, + "grad_norm": 7.465279579162598, + "learning_rate": 4.124139099456453e-05, + "loss": 3.0127, + "step": 1690500 + }, + { + "epoch": 0.5256719726066149, + "grad_norm": 9.006736755371094, + "learning_rate": 4.123880045655642e-05, + "loss": 2.9727, + "step": 1691000 + }, + { + "epoch": 0.5258274048871018, + "grad_norm": 14.183250427246094, + "learning_rate": 4.1236209918548306e-05, + "loss": 2.9741, + "step": 1691500 + }, + { + "epoch": 0.5259828371675886, + "grad_norm": 9.977410316467285, + "learning_rate": 4.1233619380540187e-05, + "loss": 2.9776, + "step": 1692000 + }, + { + "epoch": 0.5261382694480755, + "grad_norm": 6.641258239746094, + "learning_rate": 4.1231028842532074e-05, + "loss": 3.0097, + "step": 1692500 + }, + { + "epoch": 0.5262937017285624, + "grad_norm": 13.408272743225098, + "learning_rate": 4.122843830452396e-05, + "loss": 2.9867, + "step": 1693000 + }, + { + "epoch": 0.5264491340090492, + "grad_norm": 26.189355850219727, + "learning_rate": 4.122584776651585e-05, + "loss": 2.9824, + "step": 1693500 + }, + { + "epoch": 0.5266045662895361, + "grad_norm": 6.427492618560791, + "learning_rate": 4.1223257228507735e-05, + "loss": 3.014, + "step": 1694000 + }, + { + "epoch": 0.526759998570023, + "grad_norm": 7.308610439300537, + "learning_rate": 4.122066669049962e-05, + "loss": 3.0034, + "step": 1694500 + }, + { + "epoch": 0.5269154308505098, + "grad_norm": 7.932776927947998, + "learning_rate": 4.12180761524915e-05, + "loss": 2.9728, + "step": 1695000 + }, + { + "epoch": 0.5270708631309968, + "grad_norm": 11.507081031799316, + "learning_rate": 4.121548561448339e-05, + "loss": 2.9955, + "step": 1695500 + }, + { + "epoch": 0.5272262954114837, + "grad_norm": 9.154585838317871, + "learning_rate": 4.121289507647527e-05, + "loss": 3.0025, + "step": 1696000 + }, + { + "epoch": 0.5273817276919706, + "grad_norm": 7.995645523071289, + "learning_rate": 4.121030453846716e-05, + "loss": 2.9579, + "step": 1696500 + }, + { + "epoch": 0.5275371599724574, + "grad_norm": 8.932352066040039, + "learning_rate": 4.1207714000459045e-05, + "loss": 2.9796, + "step": 1697000 + }, + { + "epoch": 0.5276925922529443, + "grad_norm": 7.695853233337402, + "learning_rate": 4.120512346245093e-05, + "loss": 2.9309, + "step": 1697500 + }, + { + "epoch": 0.5278480245334312, + "grad_norm": 15.064197540283203, + "learning_rate": 4.120253292444282e-05, + "loss": 3.0001, + "step": 1698000 + }, + { + "epoch": 0.528003456813918, + "grad_norm": 10.040380477905273, + "learning_rate": 4.1199942386434706e-05, + "loss": 2.9777, + "step": 1698500 + }, + { + "epoch": 0.5281588890944049, + "grad_norm": 8.564092636108398, + "learning_rate": 4.1197351848426586e-05, + "loss": 2.9867, + "step": 1699000 + }, + { + "epoch": 0.5283143213748918, + "grad_norm": 8.42081069946289, + "learning_rate": 4.1194761310418474e-05, + "loss": 3.0118, + "step": 1699500 + }, + { + "epoch": 0.5284697536553786, + "grad_norm": 9.507670402526855, + "learning_rate": 4.119217077241036e-05, + "loss": 2.9934, + "step": 1700000 + }, + { + "epoch": 0.5286251859358655, + "grad_norm": 10.771012306213379, + "learning_rate": 4.118958023440224e-05, + "loss": 3.0021, + "step": 1700500 + }, + { + "epoch": 0.5287806182163524, + "grad_norm": 8.244629859924316, + "learning_rate": 4.118698969639413e-05, + "loss": 2.9848, + "step": 1701000 + }, + { + "epoch": 0.5289360504968393, + "grad_norm": 8.917434692382812, + "learning_rate": 4.118439915838601e-05, + "loss": 3.0209, + "step": 1701500 + }, + { + "epoch": 0.5290914827773262, + "grad_norm": 6.6024556159973145, + "learning_rate": 4.1181808620377896e-05, + "loss": 2.975, + "step": 1702000 + }, + { + "epoch": 0.5292469150578131, + "grad_norm": 11.382304191589355, + "learning_rate": 4.117921808236978e-05, + "loss": 2.9699, + "step": 1702500 + }, + { + "epoch": 0.5294023473382999, + "grad_norm": 15.093950271606445, + "learning_rate": 4.117662754436167e-05, + "loss": 2.9605, + "step": 1703000 + }, + { + "epoch": 0.5295577796187868, + "grad_norm": 10.478306770324707, + "learning_rate": 4.117403700635356e-05, + "loss": 2.9299, + "step": 1703500 + }, + { + "epoch": 0.5297132118992737, + "grad_norm": 7.333211898803711, + "learning_rate": 4.1171446468345444e-05, + "loss": 3.0071, + "step": 1704000 + }, + { + "epoch": 0.5298686441797605, + "grad_norm": 10.923650741577148, + "learning_rate": 4.1168855930337325e-05, + "loss": 2.9626, + "step": 1704500 + }, + { + "epoch": 0.5300240764602474, + "grad_norm": 8.326359748840332, + "learning_rate": 4.116626539232921e-05, + "loss": 3.0352, + "step": 1705000 + }, + { + "epoch": 0.5301795087407343, + "grad_norm": 8.852001190185547, + "learning_rate": 4.11636748543211e-05, + "loss": 2.9804, + "step": 1705500 + }, + { + "epoch": 0.5303349410212211, + "grad_norm": 13.5582275390625, + "learning_rate": 4.116108431631298e-05, + "loss": 3.0219, + "step": 1706000 + }, + { + "epoch": 0.530490373301708, + "grad_norm": 7.886613368988037, + "learning_rate": 4.115849377830487e-05, + "loss": 2.9741, + "step": 1706500 + }, + { + "epoch": 0.5306458055821949, + "grad_norm": 9.907084465026855, + "learning_rate": 4.1155903240296754e-05, + "loss": 2.999, + "step": 1707000 + }, + { + "epoch": 0.5308012378626819, + "grad_norm": 7.38801383972168, + "learning_rate": 4.115331270228864e-05, + "loss": 2.9604, + "step": 1707500 + }, + { + "epoch": 0.5309566701431687, + "grad_norm": 8.359247207641602, + "learning_rate": 4.115072216428053e-05, + "loss": 2.9912, + "step": 1708000 + }, + { + "epoch": 0.5311121024236556, + "grad_norm": 17.237491607666016, + "learning_rate": 4.114813162627241e-05, + "loss": 3.0311, + "step": 1708500 + }, + { + "epoch": 0.5312675347041425, + "grad_norm": 41.85712432861328, + "learning_rate": 4.1145541088264296e-05, + "loss": 2.9922, + "step": 1709000 + }, + { + "epoch": 0.5314229669846293, + "grad_norm": 5.701825141906738, + "learning_rate": 4.114295055025618e-05, + "loss": 2.9761, + "step": 1709500 + }, + { + "epoch": 0.5315783992651162, + "grad_norm": 8.064556121826172, + "learning_rate": 4.114036001224806e-05, + "loss": 2.9858, + "step": 1710000 + }, + { + "epoch": 0.531733831545603, + "grad_norm": 15.252641677856445, + "learning_rate": 4.113776947423995e-05, + "loss": 2.9494, + "step": 1710500 + }, + { + "epoch": 0.5318892638260899, + "grad_norm": 7.098143100738525, + "learning_rate": 4.113517893623184e-05, + "loss": 2.9585, + "step": 1711000 + }, + { + "epoch": 0.5320446961065768, + "grad_norm": 5.862090587615967, + "learning_rate": 4.113258839822372e-05, + "loss": 3.0095, + "step": 1711500 + }, + { + "epoch": 0.5322001283870637, + "grad_norm": 9.204343795776367, + "learning_rate": 4.1129997860215605e-05, + "loss": 2.925, + "step": 1712000 + }, + { + "epoch": 0.5323555606675505, + "grad_norm": 8.195585250854492, + "learning_rate": 4.112740732220749e-05, + "loss": 2.9556, + "step": 1712500 + }, + { + "epoch": 0.5325109929480374, + "grad_norm": 6.381181240081787, + "learning_rate": 4.112481678419938e-05, + "loss": 2.9512, + "step": 1713000 + }, + { + "epoch": 0.5326664252285244, + "grad_norm": 10.409774780273438, + "learning_rate": 4.1122226246191266e-05, + "loss": 3.005, + "step": 1713500 + }, + { + "epoch": 0.5328218575090112, + "grad_norm": 10.19198989868164, + "learning_rate": 4.111963570818315e-05, + "loss": 2.9811, + "step": 1714000 + }, + { + "epoch": 0.5329772897894981, + "grad_norm": 9.118926048278809, + "learning_rate": 4.1117045170175034e-05, + "loss": 2.9761, + "step": 1714500 + }, + { + "epoch": 0.533132722069985, + "grad_norm": 9.777298927307129, + "learning_rate": 4.111445463216692e-05, + "loss": 3.0016, + "step": 1715000 + }, + { + "epoch": 0.5332881543504718, + "grad_norm": 9.21190357208252, + "learning_rate": 4.11118640941588e-05, + "loss": 2.9889, + "step": 1715500 + }, + { + "epoch": 0.5334435866309587, + "grad_norm": 21.732240676879883, + "learning_rate": 4.110927355615069e-05, + "loss": 2.9252, + "step": 1716000 + }, + { + "epoch": 0.5335990189114456, + "grad_norm": 26.54075813293457, + "learning_rate": 4.1106683018142576e-05, + "loss": 2.9898, + "step": 1716500 + }, + { + "epoch": 0.5337544511919324, + "grad_norm": 7.435620307922363, + "learning_rate": 4.110409248013446e-05, + "loss": 2.9871, + "step": 1717000 + }, + { + "epoch": 0.5339098834724193, + "grad_norm": 6.960503578186035, + "learning_rate": 4.110150194212635e-05, + "loss": 2.9905, + "step": 1717500 + }, + { + "epoch": 0.5340653157529062, + "grad_norm": 7.041342258453369, + "learning_rate": 4.109891140411824e-05, + "loss": 3.0108, + "step": 1718000 + }, + { + "epoch": 0.534220748033393, + "grad_norm": 10.069799423217773, + "learning_rate": 4.109632086611012e-05, + "loss": 2.9932, + "step": 1718500 + }, + { + "epoch": 0.5343761803138799, + "grad_norm": 7.379876136779785, + "learning_rate": 4.1093730328102005e-05, + "loss": 2.9831, + "step": 1719000 + }, + { + "epoch": 0.5345316125943669, + "grad_norm": 9.17121410369873, + "learning_rate": 4.1091139790093885e-05, + "loss": 2.9586, + "step": 1719500 + }, + { + "epoch": 0.5346870448748537, + "grad_norm": 8.581711769104004, + "learning_rate": 4.108854925208577e-05, + "loss": 2.9672, + "step": 1720000 + }, + { + "epoch": 0.5348424771553406, + "grad_norm": 7.683795928955078, + "learning_rate": 4.108595871407766e-05, + "loss": 2.988, + "step": 1720500 + }, + { + "epoch": 0.5349979094358275, + "grad_norm": 11.94758415222168, + "learning_rate": 4.108336817606954e-05, + "loss": 2.926, + "step": 1721000 + }, + { + "epoch": 0.5351533417163143, + "grad_norm": 7.583193778991699, + "learning_rate": 4.108077763806143e-05, + "loss": 2.9456, + "step": 1721500 + }, + { + "epoch": 0.5353087739968012, + "grad_norm": 9.52140998840332, + "learning_rate": 4.1078187100053314e-05, + "loss": 3.0041, + "step": 1722000 + }, + { + "epoch": 0.5354642062772881, + "grad_norm": 11.221306800842285, + "learning_rate": 4.10755965620452e-05, + "loss": 3.0307, + "step": 1722500 + }, + { + "epoch": 0.535619638557775, + "grad_norm": 8.400775909423828, + "learning_rate": 4.107300602403709e-05, + "loss": 2.9933, + "step": 1723000 + }, + { + "epoch": 0.5357750708382618, + "grad_norm": 6.976308822631836, + "learning_rate": 4.1070415486028976e-05, + "loss": 2.9753, + "step": 1723500 + }, + { + "epoch": 0.5359305031187487, + "grad_norm": 14.094533920288086, + "learning_rate": 4.1067824948020856e-05, + "loss": 2.9933, + "step": 1724000 + }, + { + "epoch": 0.5360859353992355, + "grad_norm": 8.451906204223633, + "learning_rate": 4.106523441001274e-05, + "loss": 2.9866, + "step": 1724500 + }, + { + "epoch": 0.5362413676797224, + "grad_norm": 7.933333873748779, + "learning_rate": 4.106264387200463e-05, + "loss": 2.9816, + "step": 1725000 + }, + { + "epoch": 0.5363967999602094, + "grad_norm": 9.356027603149414, + "learning_rate": 4.106005333399651e-05, + "loss": 3.009, + "step": 1725500 + }, + { + "epoch": 0.5365522322406963, + "grad_norm": 11.331621170043945, + "learning_rate": 4.10574627959884e-05, + "loss": 2.9694, + "step": 1726000 + }, + { + "epoch": 0.5367076645211831, + "grad_norm": 6.684274673461914, + "learning_rate": 4.105487225798028e-05, + "loss": 2.9707, + "step": 1726500 + }, + { + "epoch": 0.53686309680167, + "grad_norm": 9.840812683105469, + "learning_rate": 4.105228171997217e-05, + "loss": 3.0236, + "step": 1727000 + }, + { + "epoch": 0.5370185290821569, + "grad_norm": 9.128630638122559, + "learning_rate": 4.104969118196406e-05, + "loss": 2.9934, + "step": 1727500 + }, + { + "epoch": 0.5371739613626437, + "grad_norm": 7.441094398498535, + "learning_rate": 4.104710064395594e-05, + "loss": 2.9468, + "step": 1728000 + }, + { + "epoch": 0.5373293936431306, + "grad_norm": 6.76605749130249, + "learning_rate": 4.104451010594783e-05, + "loss": 2.9513, + "step": 1728500 + }, + { + "epoch": 0.5374848259236175, + "grad_norm": 8.75838851928711, + "learning_rate": 4.1041919567939714e-05, + "loss": 3.0185, + "step": 1729000 + }, + { + "epoch": 0.5376402582041043, + "grad_norm": 26.70358657836914, + "learning_rate": 4.1039329029931594e-05, + "loss": 2.9965, + "step": 1729500 + }, + { + "epoch": 0.5377956904845912, + "grad_norm": 8.990678787231445, + "learning_rate": 4.103673849192348e-05, + "loss": 2.9635, + "step": 1730000 + }, + { + "epoch": 0.5379511227650781, + "grad_norm": 7.1044697761535645, + "learning_rate": 4.103414795391537e-05, + "loss": 2.9366, + "step": 1730500 + }, + { + "epoch": 0.5381065550455649, + "grad_norm": 7.039046287536621, + "learning_rate": 4.103155741590725e-05, + "loss": 2.9433, + "step": 1731000 + }, + { + "epoch": 0.5382619873260518, + "grad_norm": 8.659283638000488, + "learning_rate": 4.1028966877899136e-05, + "loss": 3.0084, + "step": 1731500 + }, + { + "epoch": 0.5384174196065388, + "grad_norm": 8.894889831542969, + "learning_rate": 4.102637633989102e-05, + "loss": 2.9725, + "step": 1732000 + }, + { + "epoch": 0.5385728518870256, + "grad_norm": 7.594513416290283, + "learning_rate": 4.102378580188291e-05, + "loss": 3.0021, + "step": 1732500 + }, + { + "epoch": 0.5387282841675125, + "grad_norm": 6.630942344665527, + "learning_rate": 4.10211952638748e-05, + "loss": 2.9803, + "step": 1733000 + }, + { + "epoch": 0.5388837164479994, + "grad_norm": 10.150473594665527, + "learning_rate": 4.101860472586668e-05, + "loss": 3.0048, + "step": 1733500 + }, + { + "epoch": 0.5390391487284862, + "grad_norm": 7.5963521003723145, + "learning_rate": 4.1016014187858565e-05, + "loss": 2.9992, + "step": 1734000 + }, + { + "epoch": 0.5391945810089731, + "grad_norm": 8.107840538024902, + "learning_rate": 4.101342364985045e-05, + "loss": 2.9689, + "step": 1734500 + }, + { + "epoch": 0.53935001328946, + "grad_norm": 7.8987016677856445, + "learning_rate": 4.101083311184233e-05, + "loss": 3.0018, + "step": 1735000 + }, + { + "epoch": 0.5395054455699468, + "grad_norm": 10.261829376220703, + "learning_rate": 4.100824257383422e-05, + "loss": 2.9714, + "step": 1735500 + }, + { + "epoch": 0.5396608778504337, + "grad_norm": 11.936629295349121, + "learning_rate": 4.100565203582611e-05, + "loss": 2.9875, + "step": 1736000 + }, + { + "epoch": 0.5398163101309206, + "grad_norm": 16.77789878845215, + "learning_rate": 4.100306149781799e-05, + "loss": 2.9858, + "step": 1736500 + }, + { + "epoch": 0.5399717424114074, + "grad_norm": 8.88807201385498, + "learning_rate": 4.100047095980988e-05, + "loss": 2.9394, + "step": 1737000 + }, + { + "epoch": 0.5401271746918943, + "grad_norm": 9.113791465759277, + "learning_rate": 4.099788042180177e-05, + "loss": 2.932, + "step": 1737500 + }, + { + "epoch": 0.5402826069723813, + "grad_norm": 11.588233947753906, + "learning_rate": 4.099528988379365e-05, + "loss": 2.9598, + "step": 1738000 + }, + { + "epoch": 0.5404380392528682, + "grad_norm": 10.491077423095703, + "learning_rate": 4.0992699345785536e-05, + "loss": 3.0226, + "step": 1738500 + }, + { + "epoch": 0.540593471533355, + "grad_norm": 8.269988059997559, + "learning_rate": 4.0990108807777416e-05, + "loss": 2.9925, + "step": 1739000 + }, + { + "epoch": 0.5407489038138419, + "grad_norm": 6.459964275360107, + "learning_rate": 4.0987518269769304e-05, + "loss": 2.9728, + "step": 1739500 + }, + { + "epoch": 0.5409043360943288, + "grad_norm": 7.8664469718933105, + "learning_rate": 4.098492773176119e-05, + "loss": 2.9561, + "step": 1740000 + }, + { + "epoch": 0.5410597683748156, + "grad_norm": 8.14901065826416, + "learning_rate": 4.098233719375307e-05, + "loss": 2.9771, + "step": 1740500 + }, + { + "epoch": 0.5412152006553025, + "grad_norm": 14.560667991638184, + "learning_rate": 4.097974665574496e-05, + "loss": 2.9805, + "step": 1741000 + }, + { + "epoch": 0.5413706329357894, + "grad_norm": 5.996594429016113, + "learning_rate": 4.0977156117736845e-05, + "loss": 2.9779, + "step": 1741500 + }, + { + "epoch": 0.5415260652162762, + "grad_norm": 10.13638687133789, + "learning_rate": 4.097456557972873e-05, + "loss": 3.0049, + "step": 1742000 + }, + { + "epoch": 0.5416814974967631, + "grad_norm": 8.015892028808594, + "learning_rate": 4.097197504172062e-05, + "loss": 2.9549, + "step": 1742500 + }, + { + "epoch": 0.54183692977725, + "grad_norm": 9.321145057678223, + "learning_rate": 4.096938450371251e-05, + "loss": 2.9723, + "step": 1743000 + }, + { + "epoch": 0.5419923620577368, + "grad_norm": 6.826521396636963, + "learning_rate": 4.096679396570439e-05, + "loss": 3.0, + "step": 1743500 + }, + { + "epoch": 0.5421477943382238, + "grad_norm": 9.890545845031738, + "learning_rate": 4.0964203427696274e-05, + "loss": 2.9996, + "step": 1744000 + }, + { + "epoch": 0.5423032266187107, + "grad_norm": 14.069190979003906, + "learning_rate": 4.0961612889688155e-05, + "loss": 2.9711, + "step": 1744500 + }, + { + "epoch": 0.5424586588991975, + "grad_norm": 9.153460502624512, + "learning_rate": 4.095902235168004e-05, + "loss": 2.9549, + "step": 1745000 + }, + { + "epoch": 0.5426140911796844, + "grad_norm": 7.47831392288208, + "learning_rate": 4.095643181367193e-05, + "loss": 2.991, + "step": 1745500 + }, + { + "epoch": 0.5427695234601713, + "grad_norm": 11.158863067626953, + "learning_rate": 4.095384127566381e-05, + "loss": 3.0142, + "step": 1746000 + }, + { + "epoch": 0.5429249557406581, + "grad_norm": 8.991155624389648, + "learning_rate": 4.0951250737655697e-05, + "loss": 3.0353, + "step": 1746500 + }, + { + "epoch": 0.543080388021145, + "grad_norm": 10.10586166381836, + "learning_rate": 4.094866019964759e-05, + "loss": 2.9546, + "step": 1747000 + }, + { + "epoch": 0.5432358203016319, + "grad_norm": 9.65713882446289, + "learning_rate": 4.094606966163947e-05, + "loss": 2.987, + "step": 1747500 + }, + { + "epoch": 0.5433912525821187, + "grad_norm": 8.598947525024414, + "learning_rate": 4.094347912363136e-05, + "loss": 2.9961, + "step": 1748000 + }, + { + "epoch": 0.5435466848626056, + "grad_norm": 14.167969703674316, + "learning_rate": 4.0940888585623245e-05, + "loss": 2.9613, + "step": 1748500 + }, + { + "epoch": 0.5437021171430925, + "grad_norm": 7.510716438293457, + "learning_rate": 4.0938298047615126e-05, + "loss": 2.9673, + "step": 1749000 + }, + { + "epoch": 0.5438575494235793, + "grad_norm": 15.541986465454102, + "learning_rate": 4.093570750960701e-05, + "loss": 2.9931, + "step": 1749500 + }, + { + "epoch": 0.5440129817040663, + "grad_norm": 7.909501552581787, + "learning_rate": 4.093311697159889e-05, + "loss": 2.9353, + "step": 1750000 + }, + { + "epoch": 0.5441684139845532, + "grad_norm": 8.22928524017334, + "learning_rate": 4.093052643359078e-05, + "loss": 2.9995, + "step": 1750500 + }, + { + "epoch": 0.54432384626504, + "grad_norm": 7.314515113830566, + "learning_rate": 4.092793589558267e-05, + "loss": 2.9543, + "step": 1751000 + }, + { + "epoch": 0.5444792785455269, + "grad_norm": 7.6573333740234375, + "learning_rate": 4.0925345357574555e-05, + "loss": 3.0159, + "step": 1751500 + }, + { + "epoch": 0.5446347108260138, + "grad_norm": 8.73977279663086, + "learning_rate": 4.092275481956644e-05, + "loss": 2.9677, + "step": 1752000 + }, + { + "epoch": 0.5447901431065006, + "grad_norm": 9.025787353515625, + "learning_rate": 4.092016428155833e-05, + "loss": 2.9326, + "step": 1752500 + }, + { + "epoch": 0.5449455753869875, + "grad_norm": 7.152261257171631, + "learning_rate": 4.091757374355021e-05, + "loss": 2.9932, + "step": 1753000 + }, + { + "epoch": 0.5451010076674744, + "grad_norm": 7.623671054840088, + "learning_rate": 4.0914983205542096e-05, + "loss": 2.9221, + "step": 1753500 + }, + { + "epoch": 0.5452564399479612, + "grad_norm": 7.735680103302002, + "learning_rate": 4.0912392667533984e-05, + "loss": 2.9271, + "step": 1754000 + }, + { + "epoch": 0.5454118722284481, + "grad_norm": 15.168661117553711, + "learning_rate": 4.0909802129525864e-05, + "loss": 2.957, + "step": 1754500 + }, + { + "epoch": 0.545567304508935, + "grad_norm": 10.07080078125, + "learning_rate": 4.090721159151775e-05, + "loss": 3.0026, + "step": 1755000 + }, + { + "epoch": 0.5457227367894218, + "grad_norm": 8.284982681274414, + "learning_rate": 4.090462105350964e-05, + "loss": 2.9498, + "step": 1755500 + }, + { + "epoch": 0.5458781690699088, + "grad_norm": 7.961293697357178, + "learning_rate": 4.090203051550152e-05, + "loss": 3.0045, + "step": 1756000 + }, + { + "epoch": 0.5460336013503957, + "grad_norm": 7.328734874725342, + "learning_rate": 4.0899439977493406e-05, + "loss": 2.9828, + "step": 1756500 + }, + { + "epoch": 0.5461890336308826, + "grad_norm": 6.838059902191162, + "learning_rate": 4.089684943948529e-05, + "loss": 3.0331, + "step": 1757000 + }, + { + "epoch": 0.5463444659113694, + "grad_norm": 8.066521644592285, + "learning_rate": 4.089425890147718e-05, + "loss": 2.9803, + "step": 1757500 + }, + { + "epoch": 0.5464998981918563, + "grad_norm": 25.703725814819336, + "learning_rate": 4.089166836346907e-05, + "loss": 2.9617, + "step": 1758000 + }, + { + "epoch": 0.5466553304723432, + "grad_norm": 11.530008316040039, + "learning_rate": 4.088907782546095e-05, + "loss": 3.0055, + "step": 1758500 + }, + { + "epoch": 0.54681076275283, + "grad_norm": 8.739970207214355, + "learning_rate": 4.0886487287452835e-05, + "loss": 2.9531, + "step": 1759000 + }, + { + "epoch": 0.5469661950333169, + "grad_norm": 8.914090156555176, + "learning_rate": 4.088389674944472e-05, + "loss": 2.9905, + "step": 1759500 + }, + { + "epoch": 0.5471216273138038, + "grad_norm": 21.5089054107666, + "learning_rate": 4.08813062114366e-05, + "loss": 2.9418, + "step": 1760000 + }, + { + "epoch": 0.5472770595942906, + "grad_norm": 7.929678916931152, + "learning_rate": 4.087871567342849e-05, + "loss": 2.98, + "step": 1760500 + }, + { + "epoch": 0.5474324918747775, + "grad_norm": 8.473624229431152, + "learning_rate": 4.0876125135420377e-05, + "loss": 2.9502, + "step": 1761000 + }, + { + "epoch": 0.5475879241552644, + "grad_norm": 7.353448390960693, + "learning_rate": 4.0873534597412264e-05, + "loss": 2.9899, + "step": 1761500 + }, + { + "epoch": 0.5477433564357513, + "grad_norm": 7.966104030609131, + "learning_rate": 4.087094405940415e-05, + "loss": 2.9728, + "step": 1762000 + }, + { + "epoch": 0.5478987887162382, + "grad_norm": 25.21462631225586, + "learning_rate": 4.086835352139603e-05, + "loss": 2.9401, + "step": 1762500 + }, + { + "epoch": 0.5480542209967251, + "grad_norm": 8.30107593536377, + "learning_rate": 4.086576298338792e-05, + "loss": 2.9845, + "step": 1763000 + }, + { + "epoch": 0.5482096532772119, + "grad_norm": 6.887980937957764, + "learning_rate": 4.0863172445379806e-05, + "loss": 2.9862, + "step": 1763500 + }, + { + "epoch": 0.5483650855576988, + "grad_norm": 18.997053146362305, + "learning_rate": 4.0860581907371686e-05, + "loss": 2.9939, + "step": 1764000 + }, + { + "epoch": 0.5485205178381857, + "grad_norm": 7.285219669342041, + "learning_rate": 4.085799136936357e-05, + "loss": 3.0431, + "step": 1764500 + }, + { + "epoch": 0.5486759501186725, + "grad_norm": 6.816135883331299, + "learning_rate": 4.085540083135546e-05, + "loss": 2.9599, + "step": 1765000 + }, + { + "epoch": 0.5488313823991594, + "grad_norm": 5.520992279052734, + "learning_rate": 4.085281029334734e-05, + "loss": 2.9666, + "step": 1765500 + }, + { + "epoch": 0.5489868146796463, + "grad_norm": 7.06913948059082, + "learning_rate": 4.085021975533923e-05, + "loss": 2.9781, + "step": 1766000 + }, + { + "epoch": 0.5491422469601331, + "grad_norm": 12.628456115722656, + "learning_rate": 4.0847629217331115e-05, + "loss": 2.9528, + "step": 1766500 + }, + { + "epoch": 0.54929767924062, + "grad_norm": 10.602959632873535, + "learning_rate": 4.0845038679323e-05, + "loss": 3.0105, + "step": 1767000 + }, + { + "epoch": 0.5494531115211069, + "grad_norm": 7.435805320739746, + "learning_rate": 4.084244814131489e-05, + "loss": 2.9448, + "step": 1767500 + }, + { + "epoch": 0.5496085438015939, + "grad_norm": 9.470816612243652, + "learning_rate": 4.083985760330677e-05, + "loss": 2.926, + "step": 1768000 + }, + { + "epoch": 0.5497639760820807, + "grad_norm": 7.803100109100342, + "learning_rate": 4.083726706529866e-05, + "loss": 2.9482, + "step": 1768500 + }, + { + "epoch": 0.5499194083625676, + "grad_norm": 8.57996940612793, + "learning_rate": 4.0834676527290544e-05, + "loss": 2.9893, + "step": 1769000 + }, + { + "epoch": 0.5500748406430545, + "grad_norm": 10.475701332092285, + "learning_rate": 4.0832085989282424e-05, + "loss": 3.0036, + "step": 1769500 + }, + { + "epoch": 0.5502302729235413, + "grad_norm": 9.164433479309082, + "learning_rate": 4.082949545127431e-05, + "loss": 2.948, + "step": 1770000 + }, + { + "epoch": 0.5503857052040282, + "grad_norm": 7.553071975708008, + "learning_rate": 4.08269049132662e-05, + "loss": 3.0537, + "step": 1770500 + }, + { + "epoch": 0.550541137484515, + "grad_norm": 18.665712356567383, + "learning_rate": 4.0824314375258086e-05, + "loss": 2.9611, + "step": 1771000 + }, + { + "epoch": 0.5506965697650019, + "grad_norm": 8.539884567260742, + "learning_rate": 4.082172383724997e-05, + "loss": 2.9846, + "step": 1771500 + }, + { + "epoch": 0.5508520020454888, + "grad_norm": 10.009017944335938, + "learning_rate": 4.081913329924186e-05, + "loss": 2.9537, + "step": 1772000 + }, + { + "epoch": 0.5510074343259757, + "grad_norm": 9.531479835510254, + "learning_rate": 4.081654276123374e-05, + "loss": 2.9766, + "step": 1772500 + }, + { + "epoch": 0.5511628666064625, + "grad_norm": 9.204269409179688, + "learning_rate": 4.081395222322563e-05, + "loss": 2.945, + "step": 1773000 + }, + { + "epoch": 0.5513182988869494, + "grad_norm": 12.325326919555664, + "learning_rate": 4.0811361685217515e-05, + "loss": 2.9815, + "step": 1773500 + }, + { + "epoch": 0.5514737311674364, + "grad_norm": 19.911252975463867, + "learning_rate": 4.0808771147209395e-05, + "loss": 2.9587, + "step": 1774000 + }, + { + "epoch": 0.5516291634479232, + "grad_norm": 6.777981281280518, + "learning_rate": 4.080618060920128e-05, + "loss": 2.9908, + "step": 1774500 + }, + { + "epoch": 0.5517845957284101, + "grad_norm": 10.837761878967285, + "learning_rate": 4.080359007119316e-05, + "loss": 2.953, + "step": 1775000 + }, + { + "epoch": 0.551940028008897, + "grad_norm": 7.25539493560791, + "learning_rate": 4.080099953318505e-05, + "loss": 2.9508, + "step": 1775500 + }, + { + "epoch": 0.5520954602893838, + "grad_norm": 10.14698314666748, + "learning_rate": 4.079840899517694e-05, + "loss": 2.9494, + "step": 1776000 + }, + { + "epoch": 0.5522508925698707, + "grad_norm": 11.289834022521973, + "learning_rate": 4.0795818457168824e-05, + "loss": 2.9999, + "step": 1776500 + }, + { + "epoch": 0.5524063248503576, + "grad_norm": 7.6357421875, + "learning_rate": 4.079322791916071e-05, + "loss": 2.924, + "step": 1777000 + }, + { + "epoch": 0.5525617571308444, + "grad_norm": 19.193302154541016, + "learning_rate": 4.07906373811526e-05, + "loss": 2.9808, + "step": 1777500 + }, + { + "epoch": 0.5527171894113313, + "grad_norm": 8.27771282196045, + "learning_rate": 4.078804684314448e-05, + "loss": 2.9438, + "step": 1778000 + }, + { + "epoch": 0.5528726216918182, + "grad_norm": 8.191520690917969, + "learning_rate": 4.0785456305136366e-05, + "loss": 2.9735, + "step": 1778500 + }, + { + "epoch": 0.553028053972305, + "grad_norm": 9.737373352050781, + "learning_rate": 4.078286576712825e-05, + "loss": 2.9901, + "step": 1779000 + }, + { + "epoch": 0.5531834862527919, + "grad_norm": 8.610369682312012, + "learning_rate": 4.0780275229120134e-05, + "loss": 3.0136, + "step": 1779500 + }, + { + "epoch": 0.5533389185332789, + "grad_norm": 9.826982498168945, + "learning_rate": 4.077768469111202e-05, + "loss": 2.9306, + "step": 1780000 + }, + { + "epoch": 0.5534943508137657, + "grad_norm": 8.984294891357422, + "learning_rate": 4.07750941531039e-05, + "loss": 2.9666, + "step": 1780500 + }, + { + "epoch": 0.5536497830942526, + "grad_norm": 21.635276794433594, + "learning_rate": 4.0772503615095795e-05, + "loss": 2.9987, + "step": 1781000 + }, + { + "epoch": 0.5538052153747395, + "grad_norm": 9.37608528137207, + "learning_rate": 4.076991307708768e-05, + "loss": 2.9586, + "step": 1781500 + }, + { + "epoch": 0.5539606476552263, + "grad_norm": 8.784427642822266, + "learning_rate": 4.076732253907956e-05, + "loss": 2.9559, + "step": 1782000 + }, + { + "epoch": 0.5541160799357132, + "grad_norm": 8.167241096496582, + "learning_rate": 4.076473200107145e-05, + "loss": 2.9719, + "step": 1782500 + }, + { + "epoch": 0.5542715122162001, + "grad_norm": 13.107195854187012, + "learning_rate": 4.076214146306334e-05, + "loss": 2.9887, + "step": 1783000 + }, + { + "epoch": 0.554426944496687, + "grad_norm": 8.316802978515625, + "learning_rate": 4.075955092505522e-05, + "loss": 2.966, + "step": 1783500 + }, + { + "epoch": 0.5545823767771738, + "grad_norm": 9.74890422821045, + "learning_rate": 4.0756960387047104e-05, + "loss": 3.0056, + "step": 1784000 + }, + { + "epoch": 0.5547378090576607, + "grad_norm": 51.92408752441406, + "learning_rate": 4.075436984903899e-05, + "loss": 2.947, + "step": 1784500 + }, + { + "epoch": 0.5548932413381475, + "grad_norm": 6.731983184814453, + "learning_rate": 4.075177931103087e-05, + "loss": 2.9723, + "step": 1785000 + }, + { + "epoch": 0.5550486736186344, + "grad_norm": 11.2239990234375, + "learning_rate": 4.074918877302276e-05, + "loss": 2.9515, + "step": 1785500 + }, + { + "epoch": 0.5552041058991214, + "grad_norm": 9.68159294128418, + "learning_rate": 4.0746598235014646e-05, + "loss": 2.98, + "step": 1786000 + }, + { + "epoch": 0.5553595381796083, + "grad_norm": 5.879319190979004, + "learning_rate": 4.074400769700653e-05, + "loss": 3.0027, + "step": 1786500 + }, + { + "epoch": 0.5555149704600951, + "grad_norm": 11.033047676086426, + "learning_rate": 4.074141715899842e-05, + "loss": 2.9604, + "step": 1787000 + }, + { + "epoch": 0.555670402740582, + "grad_norm": 7.736359596252441, + "learning_rate": 4.07388266209903e-05, + "loss": 2.9951, + "step": 1787500 + }, + { + "epoch": 0.5558258350210689, + "grad_norm": 9.790092468261719, + "learning_rate": 4.073623608298219e-05, + "loss": 2.9954, + "step": 1788000 + }, + { + "epoch": 0.5559812673015557, + "grad_norm": 8.189887046813965, + "learning_rate": 4.0733645544974075e-05, + "loss": 2.9496, + "step": 1788500 + }, + { + "epoch": 0.5561366995820426, + "grad_norm": 6.198785305023193, + "learning_rate": 4.0731055006965956e-05, + "loss": 2.9803, + "step": 1789000 + }, + { + "epoch": 0.5562921318625295, + "grad_norm": 8.13416862487793, + "learning_rate": 4.072846446895784e-05, + "loss": 2.9765, + "step": 1789500 + }, + { + "epoch": 0.5564475641430163, + "grad_norm": 9.038100242614746, + "learning_rate": 4.072587393094973e-05, + "loss": 2.9664, + "step": 1790000 + }, + { + "epoch": 0.5566029964235032, + "grad_norm": 9.358428955078125, + "learning_rate": 4.072328339294161e-05, + "loss": 2.943, + "step": 1790500 + }, + { + "epoch": 0.5567584287039901, + "grad_norm": 7.460196018218994, + "learning_rate": 4.0720692854933504e-05, + "loss": 2.9928, + "step": 1791000 + }, + { + "epoch": 0.5569138609844769, + "grad_norm": 7.983884811401367, + "learning_rate": 4.071810231692539e-05, + "loss": 2.9738, + "step": 1791500 + }, + { + "epoch": 0.5570692932649639, + "grad_norm": 8.227375030517578, + "learning_rate": 4.071551177891727e-05, + "loss": 2.9715, + "step": 1792000 + }, + { + "epoch": 0.5572247255454508, + "grad_norm": 9.455364227294922, + "learning_rate": 4.071292124090916e-05, + "loss": 2.9774, + "step": 1792500 + }, + { + "epoch": 0.5573801578259376, + "grad_norm": 7.9521989822387695, + "learning_rate": 4.071033070290104e-05, + "loss": 2.9889, + "step": 1793000 + }, + { + "epoch": 0.5575355901064245, + "grad_norm": 7.71746301651001, + "learning_rate": 4.0707740164892926e-05, + "loss": 3.0109, + "step": 1793500 + }, + { + "epoch": 0.5576910223869114, + "grad_norm": 8.741413116455078, + "learning_rate": 4.0705149626884814e-05, + "loss": 2.9592, + "step": 1794000 + }, + { + "epoch": 0.5578464546673982, + "grad_norm": 9.760941505432129, + "learning_rate": 4.0702559088876694e-05, + "loss": 2.9699, + "step": 1794500 + }, + { + "epoch": 0.5580018869478851, + "grad_norm": 7.640007972717285, + "learning_rate": 4.069996855086858e-05, + "loss": 2.9418, + "step": 1795000 + }, + { + "epoch": 0.558157319228372, + "grad_norm": 11.275463104248047, + "learning_rate": 4.069737801286047e-05, + "loss": 3.0012, + "step": 1795500 + }, + { + "epoch": 0.5583127515088588, + "grad_norm": 7.378161430358887, + "learning_rate": 4.0694787474852355e-05, + "loss": 2.9569, + "step": 1796000 + }, + { + "epoch": 0.5584681837893457, + "grad_norm": 8.963079452514648, + "learning_rate": 4.069219693684424e-05, + "loss": 3.048, + "step": 1796500 + }, + { + "epoch": 0.5586236160698326, + "grad_norm": 8.206636428833008, + "learning_rate": 4.068960639883613e-05, + "loss": 2.973, + "step": 1797000 + }, + { + "epoch": 0.5587790483503194, + "grad_norm": 9.106537818908691, + "learning_rate": 4.068701586082801e-05, + "loss": 2.9597, + "step": 1797500 + }, + { + "epoch": 0.5589344806308064, + "grad_norm": 7.274282455444336, + "learning_rate": 4.06844253228199e-05, + "loss": 2.9889, + "step": 1798000 + }, + { + "epoch": 0.5590899129112933, + "grad_norm": 6.296745300292969, + "learning_rate": 4.068183478481178e-05, + "loss": 3.0218, + "step": 1798500 + }, + { + "epoch": 0.5592453451917802, + "grad_norm": 7.7121171951293945, + "learning_rate": 4.0679244246803665e-05, + "loss": 2.9782, + "step": 1799000 + }, + { + "epoch": 0.559400777472267, + "grad_norm": 9.100218772888184, + "learning_rate": 4.067665370879555e-05, + "loss": 2.9625, + "step": 1799500 + }, + { + "epoch": 0.5595562097527539, + "grad_norm": 8.95223331451416, + "learning_rate": 4.067406317078743e-05, + "loss": 2.9653, + "step": 1800000 + }, + { + "epoch": 0.5597116420332408, + "grad_norm": 9.045318603515625, + "learning_rate": 4.067147263277932e-05, + "loss": 3.0003, + "step": 1800500 + }, + { + "epoch": 0.5598670743137276, + "grad_norm": 8.278465270996094, + "learning_rate": 4.066888209477121e-05, + "loss": 2.9717, + "step": 1801000 + }, + { + "epoch": 0.5600225065942145, + "grad_norm": 7.790107250213623, + "learning_rate": 4.0666291556763094e-05, + "loss": 2.9863, + "step": 1801500 + }, + { + "epoch": 0.5601779388747014, + "grad_norm": 9.125266075134277, + "learning_rate": 4.066370101875498e-05, + "loss": 2.9346, + "step": 1802000 + }, + { + "epoch": 0.5603333711551882, + "grad_norm": 8.743098258972168, + "learning_rate": 4.066111048074687e-05, + "loss": 2.9724, + "step": 1802500 + }, + { + "epoch": 0.5604888034356751, + "grad_norm": 8.708690643310547, + "learning_rate": 4.065851994273875e-05, + "loss": 2.9674, + "step": 1803000 + }, + { + "epoch": 0.560644235716162, + "grad_norm": 13.365150451660156, + "learning_rate": 4.0655929404730636e-05, + "loss": 3.0015, + "step": 1803500 + }, + { + "epoch": 0.5607996679966489, + "grad_norm": 9.811936378479004, + "learning_rate": 4.0653338866722516e-05, + "loss": 2.9972, + "step": 1804000 + }, + { + "epoch": 0.5609551002771358, + "grad_norm": 6.9181365966796875, + "learning_rate": 4.06507483287144e-05, + "loss": 2.979, + "step": 1804500 + }, + { + "epoch": 0.5611105325576227, + "grad_norm": 5.648136615753174, + "learning_rate": 4.064815779070629e-05, + "loss": 2.9524, + "step": 1805000 + }, + { + "epoch": 0.5612659648381095, + "grad_norm": 9.805440902709961, + "learning_rate": 4.064556725269818e-05, + "loss": 3.0194, + "step": 1805500 + }, + { + "epoch": 0.5614213971185964, + "grad_norm": 8.6397705078125, + "learning_rate": 4.0642976714690065e-05, + "loss": 2.9351, + "step": 1806000 + }, + { + "epoch": 0.5615768293990833, + "grad_norm": 8.504291534423828, + "learning_rate": 4.064038617668195e-05, + "loss": 2.9782, + "step": 1806500 + }, + { + "epoch": 0.5617322616795701, + "grad_norm": 8.756600379943848, + "learning_rate": 4.063779563867383e-05, + "loss": 2.9337, + "step": 1807000 + }, + { + "epoch": 0.561887693960057, + "grad_norm": 8.749723434448242, + "learning_rate": 4.063520510066572e-05, + "loss": 3.0088, + "step": 1807500 + }, + { + "epoch": 0.5620431262405439, + "grad_norm": 6.399398326873779, + "learning_rate": 4.0632614562657606e-05, + "loss": 2.9808, + "step": 1808000 + }, + { + "epoch": 0.5621985585210307, + "grad_norm": 9.195138931274414, + "learning_rate": 4.063002402464949e-05, + "loss": 3.0187, + "step": 1808500 + }, + { + "epoch": 0.5623539908015176, + "grad_norm": 553.5697631835938, + "learning_rate": 4.0627433486641374e-05, + "loss": 2.9717, + "step": 1809000 + }, + { + "epoch": 0.5625094230820045, + "grad_norm": 10.56895637512207, + "learning_rate": 4.062484294863326e-05, + "loss": 2.9983, + "step": 1809500 + }, + { + "epoch": 0.5626648553624914, + "grad_norm": 11.944775581359863, + "learning_rate": 4.062225241062514e-05, + "loss": 2.9455, + "step": 1810000 + }, + { + "epoch": 0.5628202876429783, + "grad_norm": 8.999252319335938, + "learning_rate": 4.061966187261703e-05, + "loss": 2.9739, + "step": 1810500 + }, + { + "epoch": 0.5629757199234652, + "grad_norm": 8.297378540039062, + "learning_rate": 4.0617071334608916e-05, + "loss": 2.9805, + "step": 1811000 + }, + { + "epoch": 0.563131152203952, + "grad_norm": 7.0784711837768555, + "learning_rate": 4.06144807966008e-05, + "loss": 2.997, + "step": 1811500 + }, + { + "epoch": 0.5632865844844389, + "grad_norm": 6.944129467010498, + "learning_rate": 4.061189025859269e-05, + "loss": 2.9811, + "step": 1812000 + }, + { + "epoch": 0.5634420167649258, + "grad_norm": 6.3796234130859375, + "learning_rate": 4.060929972058457e-05, + "loss": 3.0081, + "step": 1812500 + }, + { + "epoch": 0.5635974490454126, + "grad_norm": 9.199127197265625, + "learning_rate": 4.060670918257646e-05, + "loss": 2.926, + "step": 1813000 + }, + { + "epoch": 0.5637528813258995, + "grad_norm": 9.952258110046387, + "learning_rate": 4.0604118644568345e-05, + "loss": 2.9913, + "step": 1813500 + }, + { + "epoch": 0.5639083136063864, + "grad_norm": 7.859336853027344, + "learning_rate": 4.0601528106560225e-05, + "loss": 3.0599, + "step": 1814000 + }, + { + "epoch": 0.5640637458868732, + "grad_norm": 6.636134147644043, + "learning_rate": 4.059893756855211e-05, + "loss": 2.9367, + "step": 1814500 + }, + { + "epoch": 0.5642191781673601, + "grad_norm": 7.893278121948242, + "learning_rate": 4.0596347030544e-05, + "loss": 2.9647, + "step": 1815000 + }, + { + "epoch": 0.564374610447847, + "grad_norm": 10.270133018493652, + "learning_rate": 4.0593756492535887e-05, + "loss": 2.9752, + "step": 1815500 + }, + { + "epoch": 0.564530042728334, + "grad_norm": 8.88795280456543, + "learning_rate": 4.0591165954527774e-05, + "loss": 2.9398, + "step": 1816000 + }, + { + "epoch": 0.5646854750088208, + "grad_norm": 8.47761344909668, + "learning_rate": 4.0588575416519654e-05, + "loss": 3.001, + "step": 1816500 + }, + { + "epoch": 0.5648409072893077, + "grad_norm": 11.649569511413574, + "learning_rate": 4.058598487851154e-05, + "loss": 2.9309, + "step": 1817000 + }, + { + "epoch": 0.5649963395697946, + "grad_norm": 7.762417316436768, + "learning_rate": 4.058339434050343e-05, + "loss": 2.9851, + "step": 1817500 + }, + { + "epoch": 0.5651517718502814, + "grad_norm": 8.838757514953613, + "learning_rate": 4.058080380249531e-05, + "loss": 2.9537, + "step": 1818000 + }, + { + "epoch": 0.5653072041307683, + "grad_norm": 11.406360626220703, + "learning_rate": 4.0578213264487196e-05, + "loss": 2.956, + "step": 1818500 + }, + { + "epoch": 0.5654626364112552, + "grad_norm": 8.657159805297852, + "learning_rate": 4.057562272647908e-05, + "loss": 2.9759, + "step": 1819000 + }, + { + "epoch": 0.565618068691742, + "grad_norm": 7.720760822296143, + "learning_rate": 4.0573032188470963e-05, + "loss": 2.9519, + "step": 1819500 + }, + { + "epoch": 0.5657735009722289, + "grad_norm": 7.7890424728393555, + "learning_rate": 4.057044165046285e-05, + "loss": 2.9572, + "step": 1820000 + }, + { + "epoch": 0.5659289332527158, + "grad_norm": 8.606010437011719, + "learning_rate": 4.056785111245474e-05, + "loss": 2.965, + "step": 1820500 + }, + { + "epoch": 0.5660843655332026, + "grad_norm": 9.60297966003418, + "learning_rate": 4.0565260574446625e-05, + "loss": 2.982, + "step": 1821000 + }, + { + "epoch": 0.5662397978136895, + "grad_norm": 9.876914978027344, + "learning_rate": 4.056267003643851e-05, + "loss": 2.9249, + "step": 1821500 + }, + { + "epoch": 0.5663952300941765, + "grad_norm": 10.794466972351074, + "learning_rate": 4.056007949843039e-05, + "loss": 2.9744, + "step": 1822000 + }, + { + "epoch": 0.5665506623746633, + "grad_norm": 7.434024333953857, + "learning_rate": 4.055748896042228e-05, + "loss": 2.9528, + "step": 1822500 + }, + { + "epoch": 0.5667060946551502, + "grad_norm": 7.939813137054443, + "learning_rate": 4.055489842241417e-05, + "loss": 2.9173, + "step": 1823000 + }, + { + "epoch": 0.5668615269356371, + "grad_norm": 8.710811614990234, + "learning_rate": 4.055230788440605e-05, + "loss": 2.9536, + "step": 1823500 + }, + { + "epoch": 0.5670169592161239, + "grad_norm": 7.437973499298096, + "learning_rate": 4.0549717346397934e-05, + "loss": 2.9709, + "step": 1824000 + }, + { + "epoch": 0.5671723914966108, + "grad_norm": 6.080031394958496, + "learning_rate": 4.054712680838982e-05, + "loss": 2.9471, + "step": 1824500 + }, + { + "epoch": 0.5673278237770977, + "grad_norm": 8.297959327697754, + "learning_rate": 4.054453627038171e-05, + "loss": 2.9731, + "step": 1825000 + }, + { + "epoch": 0.5674832560575845, + "grad_norm": 7.242488384246826, + "learning_rate": 4.0541945732373596e-05, + "loss": 2.9692, + "step": 1825500 + }, + { + "epoch": 0.5676386883380714, + "grad_norm": 11.920766830444336, + "learning_rate": 4.053935519436548e-05, + "loss": 2.9707, + "step": 1826000 + }, + { + "epoch": 0.5677941206185583, + "grad_norm": 7.276578426361084, + "learning_rate": 4.053676465635736e-05, + "loss": 2.9789, + "step": 1826500 + }, + { + "epoch": 0.5679495528990451, + "grad_norm": 9.723198890686035, + "learning_rate": 4.053417411834925e-05, + "loss": 2.9595, + "step": 1827000 + }, + { + "epoch": 0.568104985179532, + "grad_norm": 8.747845649719238, + "learning_rate": 4.053158358034114e-05, + "loss": 3.0102, + "step": 1827500 + }, + { + "epoch": 0.568260417460019, + "grad_norm": 10.035425186157227, + "learning_rate": 4.052899304233302e-05, + "loss": 2.977, + "step": 1828000 + }, + { + "epoch": 0.5684158497405059, + "grad_norm": 7.820067882537842, + "learning_rate": 4.0526402504324905e-05, + "loss": 2.975, + "step": 1828500 + }, + { + "epoch": 0.5685712820209927, + "grad_norm": 8.203004837036133, + "learning_rate": 4.0523811966316786e-05, + "loss": 2.9601, + "step": 1829000 + }, + { + "epoch": 0.5687267143014796, + "grad_norm": 10.676506042480469, + "learning_rate": 4.052122142830867e-05, + "loss": 2.9595, + "step": 1829500 + }, + { + "epoch": 0.5688821465819665, + "grad_norm": 9.418689727783203, + "learning_rate": 4.051863089030056e-05, + "loss": 2.9809, + "step": 1830000 + }, + { + "epoch": 0.5690375788624533, + "grad_norm": 7.232340335845947, + "learning_rate": 4.051604035229245e-05, + "loss": 3.021, + "step": 1830500 + }, + { + "epoch": 0.5691930111429402, + "grad_norm": 11.292695045471191, + "learning_rate": 4.0513449814284334e-05, + "loss": 2.9658, + "step": 1831000 + }, + { + "epoch": 0.569348443423427, + "grad_norm": 8.62258243560791, + "learning_rate": 4.051085927627622e-05, + "loss": 2.967, + "step": 1831500 + }, + { + "epoch": 0.5695038757039139, + "grad_norm": 6.402281284332275, + "learning_rate": 4.05082687382681e-05, + "loss": 3.0033, + "step": 1832000 + }, + { + "epoch": 0.5696593079844008, + "grad_norm": 9.285646438598633, + "learning_rate": 4.050567820025999e-05, + "loss": 2.9237, + "step": 1832500 + }, + { + "epoch": 0.5698147402648877, + "grad_norm": 7.076711177825928, + "learning_rate": 4.0503087662251876e-05, + "loss": 2.9782, + "step": 1833000 + }, + { + "epoch": 0.5699701725453745, + "grad_norm": 9.919183731079102, + "learning_rate": 4.0500497124243756e-05, + "loss": 2.9751, + "step": 1833500 + }, + { + "epoch": 0.5701256048258615, + "grad_norm": 6.930329322814941, + "learning_rate": 4.0497906586235643e-05, + "loss": 2.9144, + "step": 1834000 + }, + { + "epoch": 0.5702810371063484, + "grad_norm": 8.696823120117188, + "learning_rate": 4.049531604822753e-05, + "loss": 2.9976, + "step": 1834500 + }, + { + "epoch": 0.5704364693868352, + "grad_norm": 9.599024772644043, + "learning_rate": 4.049272551021942e-05, + "loss": 2.9727, + "step": 1835000 + }, + { + "epoch": 0.5705919016673221, + "grad_norm": 7.526211738586426, + "learning_rate": 4.0490134972211305e-05, + "loss": 3.0215, + "step": 1835500 + }, + { + "epoch": 0.570747333947809, + "grad_norm": 8.119447708129883, + "learning_rate": 4.0487544434203185e-05, + "loss": 2.9841, + "step": 1836000 + }, + { + "epoch": 0.5709027662282958, + "grad_norm": 8.163471221923828, + "learning_rate": 4.048495389619507e-05, + "loss": 2.9662, + "step": 1836500 + }, + { + "epoch": 0.5710581985087827, + "grad_norm": 22.32324981689453, + "learning_rate": 4.048236335818696e-05, + "loss": 2.9618, + "step": 1837000 + }, + { + "epoch": 0.5712136307892696, + "grad_norm": 6.546942710876465, + "learning_rate": 4.047977282017884e-05, + "loss": 2.9984, + "step": 1837500 + }, + { + "epoch": 0.5713690630697564, + "grad_norm": 10.746034622192383, + "learning_rate": 4.047718228217073e-05, + "loss": 2.9818, + "step": 1838000 + }, + { + "epoch": 0.5715244953502433, + "grad_norm": 9.502684593200684, + "learning_rate": 4.0474591744162614e-05, + "loss": 2.9264, + "step": 1838500 + }, + { + "epoch": 0.5716799276307302, + "grad_norm": 12.056211471557617, + "learning_rate": 4.0472001206154495e-05, + "loss": 2.9218, + "step": 1839000 + }, + { + "epoch": 0.571835359911217, + "grad_norm": 7.549818992614746, + "learning_rate": 4.046941066814638e-05, + "loss": 2.9575, + "step": 1839500 + }, + { + "epoch": 0.571990792191704, + "grad_norm": 9.698902130126953, + "learning_rate": 4.046682013013827e-05, + "loss": 2.9725, + "step": 1840000 + }, + { + "epoch": 0.5721462244721909, + "grad_norm": 7.523335933685303, + "learning_rate": 4.0464229592130156e-05, + "loss": 2.945, + "step": 1840500 + }, + { + "epoch": 0.5723016567526777, + "grad_norm": 9.480661392211914, + "learning_rate": 4.046163905412204e-05, + "loss": 3.0041, + "step": 1841000 + }, + { + "epoch": 0.5724570890331646, + "grad_norm": 8.363966941833496, + "learning_rate": 4.0459048516113924e-05, + "loss": 2.973, + "step": 1841500 + }, + { + "epoch": 0.5726125213136515, + "grad_norm": 8.149331092834473, + "learning_rate": 4.045645797810581e-05, + "loss": 2.9491, + "step": 1842000 + }, + { + "epoch": 0.5727679535941383, + "grad_norm": 9.263171195983887, + "learning_rate": 4.04538674400977e-05, + "loss": 2.9773, + "step": 1842500 + }, + { + "epoch": 0.5729233858746252, + "grad_norm": 8.71237564086914, + "learning_rate": 4.045127690208958e-05, + "loss": 2.9386, + "step": 1843000 + }, + { + "epoch": 0.5730788181551121, + "grad_norm": 15.920612335205078, + "learning_rate": 4.0448686364081466e-05, + "loss": 2.9843, + "step": 1843500 + }, + { + "epoch": 0.573234250435599, + "grad_norm": 20.945199966430664, + "learning_rate": 4.044609582607335e-05, + "loss": 2.9447, + "step": 1844000 + }, + { + "epoch": 0.5733896827160858, + "grad_norm": 5.5081562995910645, + "learning_rate": 4.044350528806524e-05, + "loss": 2.9351, + "step": 1844500 + }, + { + "epoch": 0.5735451149965727, + "grad_norm": 8.664064407348633, + "learning_rate": 4.044091475005713e-05, + "loss": 2.9582, + "step": 1845000 + }, + { + "epoch": 0.5737005472770595, + "grad_norm": 10.641637802124023, + "learning_rate": 4.0438324212049014e-05, + "loss": 2.9604, + "step": 1845500 + }, + { + "epoch": 0.5738559795575465, + "grad_norm": 7.532942771911621, + "learning_rate": 4.0435733674040895e-05, + "loss": 2.9673, + "step": 1846000 + }, + { + "epoch": 0.5740114118380334, + "grad_norm": 7.029507160186768, + "learning_rate": 4.043314313603278e-05, + "loss": 2.9702, + "step": 1846500 + }, + { + "epoch": 0.5741668441185203, + "grad_norm": 10.335710525512695, + "learning_rate": 4.043055259802466e-05, + "loss": 2.9964, + "step": 1847000 + }, + { + "epoch": 0.5743222763990071, + "grad_norm": 7.379754543304443, + "learning_rate": 4.042796206001655e-05, + "loss": 2.9578, + "step": 1847500 + }, + { + "epoch": 0.574477708679494, + "grad_norm": 8.397046089172363, + "learning_rate": 4.0425371522008436e-05, + "loss": 2.9903, + "step": 1848000 + }, + { + "epoch": 0.5746331409599809, + "grad_norm": 8.403599739074707, + "learning_rate": 4.042278098400032e-05, + "loss": 3.0074, + "step": 1848500 + }, + { + "epoch": 0.5747885732404677, + "grad_norm": 9.415999412536621, + "learning_rate": 4.0420190445992204e-05, + "loss": 2.95, + "step": 1849000 + }, + { + "epoch": 0.5749440055209546, + "grad_norm": 12.669332504272461, + "learning_rate": 4.041759990798409e-05, + "loss": 2.9387, + "step": 1849500 + }, + { + "epoch": 0.5750994378014415, + "grad_norm": 19.689922332763672, + "learning_rate": 4.041500936997598e-05, + "loss": 2.9613, + "step": 1850000 + }, + { + "epoch": 0.5752548700819283, + "grad_norm": 6.747448444366455, + "learning_rate": 4.0412418831967865e-05, + "loss": 2.975, + "step": 1850500 + }, + { + "epoch": 0.5754103023624152, + "grad_norm": 12.214737892150879, + "learning_rate": 4.040982829395975e-05, + "loss": 2.9498, + "step": 1851000 + }, + { + "epoch": 0.5755657346429021, + "grad_norm": 9.60844612121582, + "learning_rate": 4.040723775595163e-05, + "loss": 2.9903, + "step": 1851500 + }, + { + "epoch": 0.575721166923389, + "grad_norm": 8.551641464233398, + "learning_rate": 4.040464721794352e-05, + "loss": 2.9875, + "step": 1852000 + }, + { + "epoch": 0.5758765992038759, + "grad_norm": 7.253950595855713, + "learning_rate": 4.04020566799354e-05, + "loss": 2.9915, + "step": 1852500 + }, + { + "epoch": 0.5760320314843628, + "grad_norm": 8.437673568725586, + "learning_rate": 4.039946614192729e-05, + "loss": 2.9869, + "step": 1853000 + }, + { + "epoch": 0.5761874637648496, + "grad_norm": 6.721896171569824, + "learning_rate": 4.0396875603919175e-05, + "loss": 3.0119, + "step": 1853500 + }, + { + "epoch": 0.5763428960453365, + "grad_norm": 6.992054462432861, + "learning_rate": 4.0394285065911055e-05, + "loss": 2.9563, + "step": 1854000 + }, + { + "epoch": 0.5764983283258234, + "grad_norm": 8.146021842956543, + "learning_rate": 4.039169452790295e-05, + "loss": 2.9359, + "step": 1854500 + }, + { + "epoch": 0.5766537606063102, + "grad_norm": 10.619091987609863, + "learning_rate": 4.0389103989894836e-05, + "loss": 2.9734, + "step": 1855000 + }, + { + "epoch": 0.5768091928867971, + "grad_norm": 7.071630001068115, + "learning_rate": 4.0386513451886717e-05, + "loss": 2.9901, + "step": 1855500 + }, + { + "epoch": 0.576964625167284, + "grad_norm": 8.196293830871582, + "learning_rate": 4.0383922913878604e-05, + "loss": 2.9869, + "step": 1856000 + }, + { + "epoch": 0.5771200574477708, + "grad_norm": 9.793081283569336, + "learning_rate": 4.038133237587049e-05, + "loss": 2.9979, + "step": 1856500 + }, + { + "epoch": 0.5772754897282577, + "grad_norm": 27.13294219970703, + "learning_rate": 4.037874183786237e-05, + "loss": 2.9686, + "step": 1857000 + }, + { + "epoch": 0.5774309220087446, + "grad_norm": 6.699514865875244, + "learning_rate": 4.037615129985426e-05, + "loss": 2.955, + "step": 1857500 + }, + { + "epoch": 0.5775863542892316, + "grad_norm": 13.915959358215332, + "learning_rate": 4.037356076184614e-05, + "loss": 2.9523, + "step": 1858000 + }, + { + "epoch": 0.5777417865697184, + "grad_norm": 15.912198066711426, + "learning_rate": 4.0370970223838026e-05, + "loss": 2.9297, + "step": 1858500 + }, + { + "epoch": 0.5778972188502053, + "grad_norm": 8.41680908203125, + "learning_rate": 4.036837968582991e-05, + "loss": 2.9822, + "step": 1859000 + }, + { + "epoch": 0.5780526511306922, + "grad_norm": 11.847681045532227, + "learning_rate": 4.03657891478218e-05, + "loss": 2.972, + "step": 1859500 + }, + { + "epoch": 0.578208083411179, + "grad_norm": 14.017334938049316, + "learning_rate": 4.036319860981369e-05, + "loss": 2.9283, + "step": 1860000 + }, + { + "epoch": 0.5783635156916659, + "grad_norm": 8.800368309020996, + "learning_rate": 4.0360608071805575e-05, + "loss": 2.9368, + "step": 1860500 + }, + { + "epoch": 0.5785189479721528, + "grad_norm": 18.510204315185547, + "learning_rate": 4.0358017533797455e-05, + "loss": 2.9565, + "step": 1861000 + }, + { + "epoch": 0.5786743802526396, + "grad_norm": 7.800788879394531, + "learning_rate": 4.035542699578934e-05, + "loss": 2.9765, + "step": 1861500 + }, + { + "epoch": 0.5788298125331265, + "grad_norm": 7.36224889755249, + "learning_rate": 4.035283645778123e-05, + "loss": 2.959, + "step": 1862000 + }, + { + "epoch": 0.5789852448136134, + "grad_norm": 8.511129379272461, + "learning_rate": 4.035024591977311e-05, + "loss": 2.9898, + "step": 1862500 + }, + { + "epoch": 0.5791406770941002, + "grad_norm": 9.401568412780762, + "learning_rate": 4.0347655381765e-05, + "loss": 2.949, + "step": 1863000 + }, + { + "epoch": 0.5792961093745871, + "grad_norm": 13.05501651763916, + "learning_rate": 4.0345064843756884e-05, + "loss": 2.9889, + "step": 1863500 + }, + { + "epoch": 0.579451541655074, + "grad_norm": 8.094270706176758, + "learning_rate": 4.0342474305748764e-05, + "loss": 3.0023, + "step": 1864000 + }, + { + "epoch": 0.5796069739355609, + "grad_norm": 7.260662078857422, + "learning_rate": 4.033988376774066e-05, + "loss": 2.9413, + "step": 1864500 + }, + { + "epoch": 0.5797624062160478, + "grad_norm": 8.983668327331543, + "learning_rate": 4.033729322973254e-05, + "loss": 2.9699, + "step": 1865000 + }, + { + "epoch": 0.5799178384965347, + "grad_norm": 7.623673439025879, + "learning_rate": 4.0334702691724426e-05, + "loss": 2.9314, + "step": 1865500 + }, + { + "epoch": 0.5800732707770215, + "grad_norm": 8.807304382324219, + "learning_rate": 4.033211215371631e-05, + "loss": 2.989, + "step": 1866000 + }, + { + "epoch": 0.5802287030575084, + "grad_norm": 7.751481533050537, + "learning_rate": 4.032952161570819e-05, + "loss": 2.9734, + "step": 1866500 + }, + { + "epoch": 0.5803841353379953, + "grad_norm": 11.736603736877441, + "learning_rate": 4.032693107770008e-05, + "loss": 2.9558, + "step": 1867000 + }, + { + "epoch": 0.5805395676184821, + "grad_norm": 6.927455425262451, + "learning_rate": 4.032434053969197e-05, + "loss": 2.9293, + "step": 1867500 + }, + { + "epoch": 0.580694999898969, + "grad_norm": 9.90015697479248, + "learning_rate": 4.032175000168385e-05, + "loss": 2.9619, + "step": 1868000 + }, + { + "epoch": 0.5808504321794559, + "grad_norm": 8.62161636352539, + "learning_rate": 4.0319159463675735e-05, + "loss": 2.9638, + "step": 1868500 + }, + { + "epoch": 0.5810058644599427, + "grad_norm": 9.4489107131958, + "learning_rate": 4.031656892566762e-05, + "loss": 2.9509, + "step": 1869000 + }, + { + "epoch": 0.5811612967404296, + "grad_norm": 7.754022121429443, + "learning_rate": 4.031397838765951e-05, + "loss": 2.9987, + "step": 1869500 + }, + { + "epoch": 0.5813167290209165, + "grad_norm": 6.948873519897461, + "learning_rate": 4.0311387849651397e-05, + "loss": 2.9536, + "step": 1870000 + }, + { + "epoch": 0.5814721613014034, + "grad_norm": 11.69247817993164, + "learning_rate": 4.030879731164328e-05, + "loss": 2.9678, + "step": 1870500 + }, + { + "epoch": 0.5816275935818903, + "grad_norm": 9.565226554870605, + "learning_rate": 4.0306206773635164e-05, + "loss": 2.9685, + "step": 1871000 + }, + { + "epoch": 0.5817830258623772, + "grad_norm": 8.378715515136719, + "learning_rate": 4.030361623562705e-05, + "loss": 2.9775, + "step": 1871500 + }, + { + "epoch": 0.581938458142864, + "grad_norm": 7.2941694259643555, + "learning_rate": 4.030102569761893e-05, + "loss": 2.9872, + "step": 1872000 + }, + { + "epoch": 0.5820938904233509, + "grad_norm": 10.029539108276367, + "learning_rate": 4.029843515961082e-05, + "loss": 2.9485, + "step": 1872500 + }, + { + "epoch": 0.5822493227038378, + "grad_norm": 8.145179748535156, + "learning_rate": 4.0295844621602706e-05, + "loss": 2.9335, + "step": 1873000 + }, + { + "epoch": 0.5824047549843246, + "grad_norm": 8.47540283203125, + "learning_rate": 4.0293254083594586e-05, + "loss": 2.9637, + "step": 1873500 + }, + { + "epoch": 0.5825601872648115, + "grad_norm": 9.174695014953613, + "learning_rate": 4.0290663545586473e-05, + "loss": 3.0068, + "step": 1874000 + }, + { + "epoch": 0.5827156195452984, + "grad_norm": 8.890761375427246, + "learning_rate": 4.028807300757837e-05, + "loss": 2.9266, + "step": 1874500 + }, + { + "epoch": 0.5828710518257852, + "grad_norm": 8.965580940246582, + "learning_rate": 4.028548246957025e-05, + "loss": 2.9448, + "step": 1875000 + }, + { + "epoch": 0.5830264841062721, + "grad_norm": 7.285812854766846, + "learning_rate": 4.0282891931562135e-05, + "loss": 2.9635, + "step": 1875500 + }, + { + "epoch": 0.583181916386759, + "grad_norm": 9.419124603271484, + "learning_rate": 4.028030139355402e-05, + "loss": 2.9328, + "step": 1876000 + }, + { + "epoch": 0.583337348667246, + "grad_norm": 10.15672492980957, + "learning_rate": 4.02777108555459e-05, + "loss": 2.976, + "step": 1876500 + }, + { + "epoch": 0.5834927809477328, + "grad_norm": 10.397209167480469, + "learning_rate": 4.027512031753779e-05, + "loss": 2.9538, + "step": 1877000 + }, + { + "epoch": 0.5836482132282197, + "grad_norm": 9.736611366271973, + "learning_rate": 4.027252977952967e-05, + "loss": 2.9519, + "step": 1877500 + }, + { + "epoch": 0.5838036455087066, + "grad_norm": 9.442445755004883, + "learning_rate": 4.026993924152156e-05, + "loss": 2.9525, + "step": 1878000 + }, + { + "epoch": 0.5839590777891934, + "grad_norm": 9.484270095825195, + "learning_rate": 4.0267348703513444e-05, + "loss": 2.9891, + "step": 1878500 + }, + { + "epoch": 0.5841145100696803, + "grad_norm": 11.126225471496582, + "learning_rate": 4.026475816550533e-05, + "loss": 3.0046, + "step": 1879000 + }, + { + "epoch": 0.5842699423501672, + "grad_norm": 9.367067337036133, + "learning_rate": 4.026216762749722e-05, + "loss": 2.9533, + "step": 1879500 + }, + { + "epoch": 0.584425374630654, + "grad_norm": 7.541717052459717, + "learning_rate": 4.0259577089489106e-05, + "loss": 3.0021, + "step": 1880000 + }, + { + "epoch": 0.5845808069111409, + "grad_norm": 23.202543258666992, + "learning_rate": 4.0256986551480986e-05, + "loss": 2.9549, + "step": 1880500 + }, + { + "epoch": 0.5847362391916278, + "grad_norm": 25.931636810302734, + "learning_rate": 4.025439601347287e-05, + "loss": 2.9264, + "step": 1881000 + }, + { + "epoch": 0.5848916714721146, + "grad_norm": 6.8624725341796875, + "learning_rate": 4.025180547546476e-05, + "loss": 3.0123, + "step": 1881500 + }, + { + "epoch": 0.5850471037526015, + "grad_norm": 8.813566207885742, + "learning_rate": 4.024921493745664e-05, + "loss": 2.9807, + "step": 1882000 + }, + { + "epoch": 0.5852025360330885, + "grad_norm": 9.52688980102539, + "learning_rate": 4.024662439944853e-05, + "loss": 2.9875, + "step": 1882500 + }, + { + "epoch": 0.5853579683135753, + "grad_norm": 7.527014255523682, + "learning_rate": 4.024403386144041e-05, + "loss": 3.0048, + "step": 1883000 + }, + { + "epoch": 0.5855134005940622, + "grad_norm": 8.01461124420166, + "learning_rate": 4.0241443323432295e-05, + "loss": 3.0107, + "step": 1883500 + }, + { + "epoch": 0.5856688328745491, + "grad_norm": 7.9374518394470215, + "learning_rate": 4.023885278542418e-05, + "loss": 2.9372, + "step": 1884000 + }, + { + "epoch": 0.5858242651550359, + "grad_norm": 8.513270378112793, + "learning_rate": 4.023626224741607e-05, + "loss": 2.9882, + "step": 1884500 + }, + { + "epoch": 0.5859796974355228, + "grad_norm": 9.912753105163574, + "learning_rate": 4.023367170940796e-05, + "loss": 2.9252, + "step": 1885000 + }, + { + "epoch": 0.5861351297160097, + "grad_norm": 16.114286422729492, + "learning_rate": 4.0231081171399844e-05, + "loss": 2.9974, + "step": 1885500 + }, + { + "epoch": 0.5862905619964965, + "grad_norm": 7.383599281311035, + "learning_rate": 4.0228490633391724e-05, + "loss": 2.9365, + "step": 1886000 + }, + { + "epoch": 0.5864459942769834, + "grad_norm": 7.867143154144287, + "learning_rate": 4.022590009538361e-05, + "loss": 2.9162, + "step": 1886500 + }, + { + "epoch": 0.5866014265574703, + "grad_norm": 10.486642837524414, + "learning_rate": 4.02233095573755e-05, + "loss": 2.9404, + "step": 1887000 + }, + { + "epoch": 0.5867568588379571, + "grad_norm": 8.92665958404541, + "learning_rate": 4.022071901936738e-05, + "loss": 2.9385, + "step": 1887500 + }, + { + "epoch": 0.586912291118444, + "grad_norm": 14.1974515914917, + "learning_rate": 4.0218128481359266e-05, + "loss": 2.9718, + "step": 1888000 + }, + { + "epoch": 0.587067723398931, + "grad_norm": 7.259216785430908, + "learning_rate": 4.0215537943351153e-05, + "loss": 2.9514, + "step": 1888500 + }, + { + "epoch": 0.5872231556794179, + "grad_norm": 7.42537260055542, + "learning_rate": 4.021294740534304e-05, + "loss": 2.9531, + "step": 1889000 + }, + { + "epoch": 0.5873785879599047, + "grad_norm": 10.645432472229004, + "learning_rate": 4.021035686733493e-05, + "loss": 2.9424, + "step": 1889500 + }, + { + "epoch": 0.5875340202403916, + "grad_norm": 7.966382026672363, + "learning_rate": 4.020776632932681e-05, + "loss": 3.0121, + "step": 1890000 + }, + { + "epoch": 0.5876894525208785, + "grad_norm": 13.127360343933105, + "learning_rate": 4.0205175791318695e-05, + "loss": 3.0026, + "step": 1890500 + }, + { + "epoch": 0.5878448848013653, + "grad_norm": 7.107528209686279, + "learning_rate": 4.020258525331058e-05, + "loss": 2.9684, + "step": 1891000 + }, + { + "epoch": 0.5880003170818522, + "grad_norm": 7.944069862365723, + "learning_rate": 4.019999471530246e-05, + "loss": 2.9812, + "step": 1891500 + }, + { + "epoch": 0.588155749362339, + "grad_norm": 9.316898345947266, + "learning_rate": 4.019740417729435e-05, + "loss": 2.9694, + "step": 1892000 + }, + { + "epoch": 0.5883111816428259, + "grad_norm": 9.144784927368164, + "learning_rate": 4.019481363928624e-05, + "loss": 2.9763, + "step": 1892500 + }, + { + "epoch": 0.5884666139233128, + "grad_norm": 7.77641487121582, + "learning_rate": 4.019222310127812e-05, + "loss": 2.9612, + "step": 1893000 + }, + { + "epoch": 0.5886220462037997, + "grad_norm": 11.691157341003418, + "learning_rate": 4.0189632563270005e-05, + "loss": 2.9576, + "step": 1893500 + }, + { + "epoch": 0.5887774784842865, + "grad_norm": 7.552149772644043, + "learning_rate": 4.018704202526189e-05, + "loss": 2.9813, + "step": 1894000 + }, + { + "epoch": 0.5889329107647735, + "grad_norm": 11.28922176361084, + "learning_rate": 4.018445148725378e-05, + "loss": 3.0036, + "step": 1894500 + }, + { + "epoch": 0.5890883430452604, + "grad_norm": 9.780399322509766, + "learning_rate": 4.0181860949245666e-05, + "loss": 2.926, + "step": 1895000 + }, + { + "epoch": 0.5892437753257472, + "grad_norm": 8.250375747680664, + "learning_rate": 4.0179270411237547e-05, + "loss": 2.9398, + "step": 1895500 + }, + { + "epoch": 0.5893992076062341, + "grad_norm": 9.242671012878418, + "learning_rate": 4.0176679873229434e-05, + "loss": 2.9414, + "step": 1896000 + }, + { + "epoch": 0.589554639886721, + "grad_norm": 5.900136470794678, + "learning_rate": 4.017408933522132e-05, + "loss": 2.9797, + "step": 1896500 + }, + { + "epoch": 0.5897100721672078, + "grad_norm": 6.7396240234375, + "learning_rate": 4.01714987972132e-05, + "loss": 2.9637, + "step": 1897000 + }, + { + "epoch": 0.5898655044476947, + "grad_norm": 10.836135864257812, + "learning_rate": 4.016890825920509e-05, + "loss": 2.9797, + "step": 1897500 + }, + { + "epoch": 0.5900209367281816, + "grad_norm": 8.48257064819336, + "learning_rate": 4.0166317721196976e-05, + "loss": 2.9298, + "step": 1898000 + }, + { + "epoch": 0.5901763690086684, + "grad_norm": 24.910869598388672, + "learning_rate": 4.016372718318886e-05, + "loss": 3.0207, + "step": 1898500 + }, + { + "epoch": 0.5903318012891553, + "grad_norm": 7.356391906738281, + "learning_rate": 4.016113664518075e-05, + "loss": 2.9619, + "step": 1899000 + }, + { + "epoch": 0.5904872335696422, + "grad_norm": 7.696187973022461, + "learning_rate": 4.015854610717264e-05, + "loss": 2.9656, + "step": 1899500 + }, + { + "epoch": 0.590642665850129, + "grad_norm": 8.997997283935547, + "learning_rate": 4.015595556916452e-05, + "loss": 2.9448, + "step": 1900000 + }, + { + "epoch": 0.590798098130616, + "grad_norm": 12.416702270507812, + "learning_rate": 4.0153365031156404e-05, + "loss": 2.9186, + "step": 1900500 + }, + { + "epoch": 0.5909535304111029, + "grad_norm": 7.650506973266602, + "learning_rate": 4.0150774493148285e-05, + "loss": 2.965, + "step": 1901000 + }, + { + "epoch": 0.5911089626915897, + "grad_norm": 23.074501037597656, + "learning_rate": 4.014818395514017e-05, + "loss": 2.9289, + "step": 1901500 + }, + { + "epoch": 0.5912643949720766, + "grad_norm": 8.032337188720703, + "learning_rate": 4.014559341713206e-05, + "loss": 2.9495, + "step": 1902000 + }, + { + "epoch": 0.5914198272525635, + "grad_norm": 7.7092695236206055, + "learning_rate": 4.014300287912394e-05, + "loss": 2.955, + "step": 1902500 + }, + { + "epoch": 0.5915752595330503, + "grad_norm": 10.16049575805664, + "learning_rate": 4.014041234111583e-05, + "loss": 2.9564, + "step": 1903000 + }, + { + "epoch": 0.5917306918135372, + "grad_norm": 7.143554210662842, + "learning_rate": 4.0137821803107714e-05, + "loss": 2.9461, + "step": 1903500 + }, + { + "epoch": 0.5918861240940241, + "grad_norm": 7.932815074920654, + "learning_rate": 4.01352312650996e-05, + "loss": 2.94, + "step": 1904000 + }, + { + "epoch": 0.592041556374511, + "grad_norm": 6.945794105529785, + "learning_rate": 4.013264072709149e-05, + "loss": 2.9638, + "step": 1904500 + }, + { + "epoch": 0.5921969886549978, + "grad_norm": 9.162796974182129, + "learning_rate": 4.0130050189083375e-05, + "loss": 2.9343, + "step": 1905000 + }, + { + "epoch": 0.5923524209354847, + "grad_norm": 7.956560134887695, + "learning_rate": 4.0127459651075256e-05, + "loss": 2.9498, + "step": 1905500 + }, + { + "epoch": 0.5925078532159715, + "grad_norm": 9.347311019897461, + "learning_rate": 4.012486911306714e-05, + "loss": 2.9671, + "step": 1906000 + }, + { + "epoch": 0.5926632854964585, + "grad_norm": 9.250600814819336, + "learning_rate": 4.012227857505902e-05, + "loss": 2.9291, + "step": 1906500 + }, + { + "epoch": 0.5928187177769454, + "grad_norm": 7.473616600036621, + "learning_rate": 4.011968803705091e-05, + "loss": 2.9119, + "step": 1907000 + }, + { + "epoch": 0.5929741500574323, + "grad_norm": 8.638497352600098, + "learning_rate": 4.01170974990428e-05, + "loss": 3.0238, + "step": 1907500 + }, + { + "epoch": 0.5931295823379191, + "grad_norm": 8.279069900512695, + "learning_rate": 4.0114506961034685e-05, + "loss": 2.973, + "step": 1908000 + }, + { + "epoch": 0.593285014618406, + "grad_norm": 6.731867790222168, + "learning_rate": 4.011191642302657e-05, + "loss": 2.9667, + "step": 1908500 + }, + { + "epoch": 0.5934404468988929, + "grad_norm": 6.981318473815918, + "learning_rate": 4.010932588501846e-05, + "loss": 2.9663, + "step": 1909000 + }, + { + "epoch": 0.5935958791793797, + "grad_norm": 10.790255546569824, + "learning_rate": 4.010673534701034e-05, + "loss": 2.9427, + "step": 1909500 + }, + { + "epoch": 0.5937513114598666, + "grad_norm": 8.896047592163086, + "learning_rate": 4.0104144809002227e-05, + "loss": 2.951, + "step": 1910000 + }, + { + "epoch": 0.5939067437403535, + "grad_norm": 7.2096147537231445, + "learning_rate": 4.0101554270994114e-05, + "loss": 2.9886, + "step": 1910500 + }, + { + "epoch": 0.5940621760208403, + "grad_norm": 10.275404930114746, + "learning_rate": 4.0098963732985994e-05, + "loss": 2.9651, + "step": 1911000 + }, + { + "epoch": 0.5942176083013272, + "grad_norm": 6.840910911560059, + "learning_rate": 4.009637319497788e-05, + "loss": 2.9716, + "step": 1911500 + }, + { + "epoch": 0.5943730405818141, + "grad_norm": 11.49067211151123, + "learning_rate": 4.009378265696977e-05, + "loss": 2.932, + "step": 1912000 + }, + { + "epoch": 0.594528472862301, + "grad_norm": 10.550823211669922, + "learning_rate": 4.009119211896165e-05, + "loss": 2.9493, + "step": 1912500 + }, + { + "epoch": 0.5946839051427879, + "grad_norm": 7.6340155601501465, + "learning_rate": 4.0088601580953536e-05, + "loss": 2.9933, + "step": 1913000 + }, + { + "epoch": 0.5948393374232748, + "grad_norm": 8.712211608886719, + "learning_rate": 4.008601104294542e-05, + "loss": 2.9415, + "step": 1913500 + }, + { + "epoch": 0.5949947697037616, + "grad_norm": 7.996584415435791, + "learning_rate": 4.008342050493731e-05, + "loss": 2.9293, + "step": 1914000 + }, + { + "epoch": 0.5951502019842485, + "grad_norm": 9.748420715332031, + "learning_rate": 4.00808299669292e-05, + "loss": 2.964, + "step": 1914500 + }, + { + "epoch": 0.5953056342647354, + "grad_norm": 7.780703067779541, + "learning_rate": 4.007823942892108e-05, + "loss": 2.9599, + "step": 1915000 + }, + { + "epoch": 0.5954610665452222, + "grad_norm": 8.471625328063965, + "learning_rate": 4.0075648890912965e-05, + "loss": 2.9231, + "step": 1915500 + }, + { + "epoch": 0.5956164988257091, + "grad_norm": 9.862652778625488, + "learning_rate": 4.007305835290485e-05, + "loss": 2.9858, + "step": 1916000 + }, + { + "epoch": 0.595771931106196, + "grad_norm": 8.860836029052734, + "learning_rate": 4.007046781489673e-05, + "loss": 2.9777, + "step": 1916500 + }, + { + "epoch": 0.5959273633866828, + "grad_norm": 10.768841743469238, + "learning_rate": 4.006787727688862e-05, + "loss": 2.9836, + "step": 1917000 + }, + { + "epoch": 0.5960827956671697, + "grad_norm": 9.180926322937012, + "learning_rate": 4.006528673888051e-05, + "loss": 2.9817, + "step": 1917500 + }, + { + "epoch": 0.5962382279476566, + "grad_norm": 33.087276458740234, + "learning_rate": 4.0062696200872394e-05, + "loss": 2.9858, + "step": 1918000 + }, + { + "epoch": 0.5963936602281436, + "grad_norm": 10.017367362976074, + "learning_rate": 4.006010566286428e-05, + "loss": 2.9529, + "step": 1918500 + }, + { + "epoch": 0.5965490925086304, + "grad_norm": 9.363588333129883, + "learning_rate": 4.005751512485616e-05, + "loss": 2.9194, + "step": 1919000 + }, + { + "epoch": 0.5967045247891173, + "grad_norm": 9.51067066192627, + "learning_rate": 4.005492458684805e-05, + "loss": 3.0325, + "step": 1919500 + }, + { + "epoch": 0.5968599570696042, + "grad_norm": 8.635669708251953, + "learning_rate": 4.0052334048839936e-05, + "loss": 2.9935, + "step": 1920000 + }, + { + "epoch": 0.597015389350091, + "grad_norm": 8.078288078308105, + "learning_rate": 4.0049743510831816e-05, + "loss": 3.0012, + "step": 1920500 + }, + { + "epoch": 0.5971708216305779, + "grad_norm": 8.586241722106934, + "learning_rate": 4.00471529728237e-05, + "loss": 2.995, + "step": 1921000 + }, + { + "epoch": 0.5973262539110648, + "grad_norm": 7.837130546569824, + "learning_rate": 4.004456243481559e-05, + "loss": 2.9646, + "step": 1921500 + }, + { + "epoch": 0.5974816861915516, + "grad_norm": 8.727289199829102, + "learning_rate": 4.004197189680747e-05, + "loss": 2.9382, + "step": 1922000 + }, + { + "epoch": 0.5976371184720385, + "grad_norm": 8.248546600341797, + "learning_rate": 4.003938135879936e-05, + "loss": 2.9345, + "step": 1922500 + }, + { + "epoch": 0.5977925507525254, + "grad_norm": 9.072442054748535, + "learning_rate": 4.0036790820791245e-05, + "loss": 2.921, + "step": 1923000 + }, + { + "epoch": 0.5979479830330122, + "grad_norm": 7.539675712585449, + "learning_rate": 4.003420028278313e-05, + "loss": 2.9934, + "step": 1923500 + }, + { + "epoch": 0.5981034153134991, + "grad_norm": 9.460856437683105, + "learning_rate": 4.003160974477502e-05, + "loss": 2.916, + "step": 1924000 + }, + { + "epoch": 0.5982588475939861, + "grad_norm": 8.387019157409668, + "learning_rate": 4.00290192067669e-05, + "loss": 2.966, + "step": 1924500 + }, + { + "epoch": 0.5984142798744729, + "grad_norm": 5.6853814125061035, + "learning_rate": 4.002642866875879e-05, + "loss": 2.9768, + "step": 1925000 + }, + { + "epoch": 0.5985697121549598, + "grad_norm": 10.472945213317871, + "learning_rate": 4.0023838130750674e-05, + "loss": 2.9299, + "step": 1925500 + }, + { + "epoch": 0.5987251444354467, + "grad_norm": 10.88846492767334, + "learning_rate": 4.0021247592742554e-05, + "loss": 2.9978, + "step": 1926000 + }, + { + "epoch": 0.5988805767159335, + "grad_norm": 8.004718780517578, + "learning_rate": 4.001865705473444e-05, + "loss": 2.9074, + "step": 1926500 + }, + { + "epoch": 0.5990360089964204, + "grad_norm": 10.250131607055664, + "learning_rate": 4.001606651672633e-05, + "loss": 2.9432, + "step": 1927000 + }, + { + "epoch": 0.5991914412769073, + "grad_norm": 8.314830780029297, + "learning_rate": 4.001347597871821e-05, + "loss": 2.9525, + "step": 1927500 + }, + { + "epoch": 0.5993468735573941, + "grad_norm": 8.853450775146484, + "learning_rate": 4.00108854407101e-05, + "loss": 2.931, + "step": 1928000 + }, + { + "epoch": 0.599502305837881, + "grad_norm": 5.4689555168151855, + "learning_rate": 4.000829490270199e-05, + "loss": 2.9814, + "step": 1928500 + }, + { + "epoch": 0.5996577381183679, + "grad_norm": 8.299118995666504, + "learning_rate": 4.000570436469387e-05, + "loss": 2.9422, + "step": 1929000 + }, + { + "epoch": 0.5998131703988547, + "grad_norm": 9.519643783569336, + "learning_rate": 4.000311382668576e-05, + "loss": 2.9297, + "step": 1929500 + }, + { + "epoch": 0.5999686026793416, + "grad_norm": 8.392081260681152, + "learning_rate": 4.0000523288677645e-05, + "loss": 2.9846, + "step": 1930000 + }, + { + "epoch": 0.6001240349598286, + "grad_norm": 8.113951683044434, + "learning_rate": 3.9997932750669525e-05, + "loss": 2.9686, + "step": 1930500 + }, + { + "epoch": 0.6002794672403154, + "grad_norm": 7.506832599639893, + "learning_rate": 3.999534221266141e-05, + "loss": 2.9549, + "step": 1931000 + }, + { + "epoch": 0.6004348995208023, + "grad_norm": 8.570659637451172, + "learning_rate": 3.999275167465329e-05, + "loss": 2.9421, + "step": 1931500 + }, + { + "epoch": 0.6005903318012892, + "grad_norm": 8.395078659057617, + "learning_rate": 3.999016113664518e-05, + "loss": 2.9774, + "step": 1932000 + }, + { + "epoch": 0.600745764081776, + "grad_norm": 21.976804733276367, + "learning_rate": 3.998757059863707e-05, + "loss": 2.9855, + "step": 1932500 + }, + { + "epoch": 0.6009011963622629, + "grad_norm": 7.250117778778076, + "learning_rate": 3.9984980060628954e-05, + "loss": 2.956, + "step": 1933000 + }, + { + "epoch": 0.6010566286427498, + "grad_norm": 7.135907173156738, + "learning_rate": 3.998238952262084e-05, + "loss": 2.9655, + "step": 1933500 + }, + { + "epoch": 0.6012120609232366, + "grad_norm": 6.038496971130371, + "learning_rate": 3.997979898461273e-05, + "loss": 2.934, + "step": 1934000 + }, + { + "epoch": 0.6013674932037235, + "grad_norm": 8.590614318847656, + "learning_rate": 3.997720844660461e-05, + "loss": 2.9451, + "step": 1934500 + }, + { + "epoch": 0.6015229254842104, + "grad_norm": 8.694826126098633, + "learning_rate": 3.9974617908596496e-05, + "loss": 2.954, + "step": 1935000 + }, + { + "epoch": 0.6016783577646972, + "grad_norm": 9.280768394470215, + "learning_rate": 3.997202737058838e-05, + "loss": 2.9507, + "step": 1935500 + }, + { + "epoch": 0.6018337900451841, + "grad_norm": 8.271180152893066, + "learning_rate": 3.9969436832580264e-05, + "loss": 2.9897, + "step": 1936000 + }, + { + "epoch": 0.6019892223256711, + "grad_norm": 6.651460647583008, + "learning_rate": 3.996684629457215e-05, + "loss": 2.9455, + "step": 1936500 + }, + { + "epoch": 0.602144654606158, + "grad_norm": 8.45785140991211, + "learning_rate": 3.996425575656403e-05, + "loss": 2.9807, + "step": 1937000 + }, + { + "epoch": 0.6023000868866448, + "grad_norm": 9.463151931762695, + "learning_rate": 3.996166521855592e-05, + "loss": 3.0162, + "step": 1937500 + }, + { + "epoch": 0.6024555191671317, + "grad_norm": 15.010601997375488, + "learning_rate": 3.995907468054781e-05, + "loss": 2.9357, + "step": 1938000 + }, + { + "epoch": 0.6026109514476186, + "grad_norm": 8.600597381591797, + "learning_rate": 3.995648414253969e-05, + "loss": 2.9481, + "step": 1938500 + }, + { + "epoch": 0.6027663837281054, + "grad_norm": 8.165252685546875, + "learning_rate": 3.995389360453158e-05, + "loss": 2.9785, + "step": 1939000 + }, + { + "epoch": 0.6029218160085923, + "grad_norm": 7.588996410369873, + "learning_rate": 3.995130306652347e-05, + "loss": 2.9688, + "step": 1939500 + }, + { + "epoch": 0.6030772482890792, + "grad_norm": 9.840226173400879, + "learning_rate": 3.994871252851535e-05, + "loss": 2.9431, + "step": 1940000 + }, + { + "epoch": 0.603232680569566, + "grad_norm": 9.035603523254395, + "learning_rate": 3.9946121990507234e-05, + "loss": 2.9782, + "step": 1940500 + }, + { + "epoch": 0.6033881128500529, + "grad_norm": 6.424383640289307, + "learning_rate": 3.994353145249912e-05, + "loss": 2.9872, + "step": 1941000 + }, + { + "epoch": 0.6035435451305398, + "grad_norm": 11.839873313903809, + "learning_rate": 3.9940940914491e-05, + "loss": 2.9761, + "step": 1941500 + }, + { + "epoch": 0.6036989774110266, + "grad_norm": 15.162928581237793, + "learning_rate": 3.993835037648289e-05, + "loss": 3.0285, + "step": 1942000 + }, + { + "epoch": 0.6038544096915136, + "grad_norm": 8.399908065795898, + "learning_rate": 3.9935759838474776e-05, + "loss": 2.9468, + "step": 1942500 + }, + { + "epoch": 0.6040098419720005, + "grad_norm": 13.816254615783691, + "learning_rate": 3.9933169300466663e-05, + "loss": 2.9478, + "step": 1943000 + }, + { + "epoch": 0.6041652742524873, + "grad_norm": 9.6878080368042, + "learning_rate": 3.993057876245855e-05, + "loss": 2.9345, + "step": 1943500 + }, + { + "epoch": 0.6043207065329742, + "grad_norm": 8.000397682189941, + "learning_rate": 3.992798822445043e-05, + "loss": 2.9382, + "step": 1944000 + }, + { + "epoch": 0.6044761388134611, + "grad_norm": 8.461371421813965, + "learning_rate": 3.992539768644232e-05, + "loss": 2.9893, + "step": 1944500 + }, + { + "epoch": 0.6046315710939479, + "grad_norm": 21.892868041992188, + "learning_rate": 3.9922807148434205e-05, + "loss": 2.9485, + "step": 1945000 + }, + { + "epoch": 0.6047870033744348, + "grad_norm": 6.983076572418213, + "learning_rate": 3.9920216610426086e-05, + "loss": 2.9586, + "step": 1945500 + }, + { + "epoch": 0.6049424356549217, + "grad_norm": 6.84112024307251, + "learning_rate": 3.991762607241797e-05, + "loss": 2.9498, + "step": 1946000 + }, + { + "epoch": 0.6050978679354085, + "grad_norm": 8.20512580871582, + "learning_rate": 3.991503553440986e-05, + "loss": 2.97, + "step": 1946500 + }, + { + "epoch": 0.6052533002158954, + "grad_norm": 6.640162467956543, + "learning_rate": 3.991244499640174e-05, + "loss": 2.9703, + "step": 1947000 + }, + { + "epoch": 0.6054087324963823, + "grad_norm": 8.650099754333496, + "learning_rate": 3.990985445839363e-05, + "loss": 2.9465, + "step": 1947500 + }, + { + "epoch": 0.6055641647768691, + "grad_norm": 8.103534698486328, + "learning_rate": 3.990726392038552e-05, + "loss": 2.9461, + "step": 1948000 + }, + { + "epoch": 0.6057195970573561, + "grad_norm": 9.570183753967285, + "learning_rate": 3.99046733823774e-05, + "loss": 2.9811, + "step": 1948500 + }, + { + "epoch": 0.605875029337843, + "grad_norm": 8.505210876464844, + "learning_rate": 3.990208284436929e-05, + "loss": 2.9665, + "step": 1949000 + }, + { + "epoch": 0.6060304616183299, + "grad_norm": 10.285574913024902, + "learning_rate": 3.989949230636117e-05, + "loss": 2.9393, + "step": 1949500 + }, + { + "epoch": 0.6061858938988167, + "grad_norm": 7.685912132263184, + "learning_rate": 3.9896901768353056e-05, + "loss": 2.9233, + "step": 1950000 + }, + { + "epoch": 0.6063413261793036, + "grad_norm": 9.189940452575684, + "learning_rate": 3.9894311230344944e-05, + "loss": 2.9557, + "step": 1950500 + }, + { + "epoch": 0.6064967584597905, + "grad_norm": 8.406515121459961, + "learning_rate": 3.9891720692336824e-05, + "loss": 2.9146, + "step": 1951000 + }, + { + "epoch": 0.6066521907402773, + "grad_norm": 10.007396697998047, + "learning_rate": 3.988913015432871e-05, + "loss": 2.9696, + "step": 1951500 + }, + { + "epoch": 0.6068076230207642, + "grad_norm": 7.889960765838623, + "learning_rate": 3.98865396163206e-05, + "loss": 2.9898, + "step": 1952000 + }, + { + "epoch": 0.606963055301251, + "grad_norm": 10.727261543273926, + "learning_rate": 3.9883949078312485e-05, + "loss": 2.9369, + "step": 1952500 + }, + { + "epoch": 0.6071184875817379, + "grad_norm": 7.722128868103027, + "learning_rate": 3.988135854030437e-05, + "loss": 2.9755, + "step": 1953000 + }, + { + "epoch": 0.6072739198622248, + "grad_norm": 11.72795295715332, + "learning_rate": 3.987876800229626e-05, + "loss": 2.9581, + "step": 1953500 + }, + { + "epoch": 0.6074293521427117, + "grad_norm": 7.282704830169678, + "learning_rate": 3.987617746428814e-05, + "loss": 2.9426, + "step": 1954000 + }, + { + "epoch": 0.6075847844231986, + "grad_norm": 8.42080020904541, + "learning_rate": 3.987358692628003e-05, + "loss": 2.9525, + "step": 1954500 + }, + { + "epoch": 0.6077402167036855, + "grad_norm": 8.707021713256836, + "learning_rate": 3.987099638827191e-05, + "loss": 2.9737, + "step": 1955000 + }, + { + "epoch": 0.6078956489841724, + "grad_norm": 6.652725696563721, + "learning_rate": 3.9868405850263795e-05, + "loss": 2.9534, + "step": 1955500 + }, + { + "epoch": 0.6080510812646592, + "grad_norm": 8.543684959411621, + "learning_rate": 3.986581531225568e-05, + "loss": 2.931, + "step": 1956000 + }, + { + "epoch": 0.6082065135451461, + "grad_norm": 7.185638427734375, + "learning_rate": 3.986322477424756e-05, + "loss": 2.955, + "step": 1956500 + }, + { + "epoch": 0.608361945825633, + "grad_norm": 7.079722881317139, + "learning_rate": 3.986063423623945e-05, + "loss": 2.9546, + "step": 1957000 + }, + { + "epoch": 0.6085173781061198, + "grad_norm": 7.564360618591309, + "learning_rate": 3.985804369823134e-05, + "loss": 2.9351, + "step": 1957500 + }, + { + "epoch": 0.6086728103866067, + "grad_norm": 5.791171550750732, + "learning_rate": 3.9855453160223224e-05, + "loss": 3.0125, + "step": 1958000 + }, + { + "epoch": 0.6088282426670936, + "grad_norm": 14.266308784484863, + "learning_rate": 3.985286262221511e-05, + "loss": 2.9626, + "step": 1958500 + }, + { + "epoch": 0.6089836749475804, + "grad_norm": 9.982133865356445, + "learning_rate": 3.9850272084207e-05, + "loss": 2.9468, + "step": 1959000 + }, + { + "epoch": 0.6091391072280673, + "grad_norm": 12.761066436767578, + "learning_rate": 3.984768154619888e-05, + "loss": 2.9521, + "step": 1959500 + }, + { + "epoch": 0.6092945395085542, + "grad_norm": 8.840879440307617, + "learning_rate": 3.9845091008190766e-05, + "loss": 2.9297, + "step": 1960000 + }, + { + "epoch": 0.6094499717890411, + "grad_norm": 7.969274997711182, + "learning_rate": 3.9842500470182646e-05, + "loss": 2.9754, + "step": 1960500 + }, + { + "epoch": 0.609605404069528, + "grad_norm": 12.87790584564209, + "learning_rate": 3.983990993217453e-05, + "loss": 2.9758, + "step": 1961000 + }, + { + "epoch": 0.6097608363500149, + "grad_norm": 7.7402424812316895, + "learning_rate": 3.983731939416642e-05, + "loss": 2.9659, + "step": 1961500 + }, + { + "epoch": 0.6099162686305017, + "grad_norm": 9.43837833404541, + "learning_rate": 3.983472885615831e-05, + "loss": 2.9506, + "step": 1962000 + }, + { + "epoch": 0.6100717009109886, + "grad_norm": 9.16285228729248, + "learning_rate": 3.9832138318150195e-05, + "loss": 2.9544, + "step": 1962500 + }, + { + "epoch": 0.6102271331914755, + "grad_norm": 9.555452346801758, + "learning_rate": 3.982954778014208e-05, + "loss": 2.9705, + "step": 1963000 + }, + { + "epoch": 0.6103825654719623, + "grad_norm": 8.832555770874023, + "learning_rate": 3.982695724213396e-05, + "loss": 2.932, + "step": 1963500 + }, + { + "epoch": 0.6105379977524492, + "grad_norm": 8.088033676147461, + "learning_rate": 3.982436670412585e-05, + "loss": 2.962, + "step": 1964000 + }, + { + "epoch": 0.6106934300329361, + "grad_norm": 7.567655086517334, + "learning_rate": 3.9821776166117737e-05, + "loss": 2.9173, + "step": 1964500 + }, + { + "epoch": 0.610848862313423, + "grad_norm": 6.069338798522949, + "learning_rate": 3.981918562810962e-05, + "loss": 2.9324, + "step": 1965000 + }, + { + "epoch": 0.6110042945939098, + "grad_norm": 28.569210052490234, + "learning_rate": 3.9816595090101504e-05, + "loss": 3.0187, + "step": 1965500 + }, + { + "epoch": 0.6111597268743967, + "grad_norm": 9.471424102783203, + "learning_rate": 3.981400455209339e-05, + "loss": 2.9982, + "step": 1966000 + }, + { + "epoch": 0.6113151591548837, + "grad_norm": 7.987004280090332, + "learning_rate": 3.981141401408527e-05, + "loss": 2.9731, + "step": 1966500 + }, + { + "epoch": 0.6114705914353705, + "grad_norm": 7.408304691314697, + "learning_rate": 3.980882347607716e-05, + "loss": 2.9823, + "step": 1967000 + }, + { + "epoch": 0.6116260237158574, + "grad_norm": 8.406847953796387, + "learning_rate": 3.9806232938069046e-05, + "loss": 3.0032, + "step": 1967500 + }, + { + "epoch": 0.6117814559963443, + "grad_norm": 15.36120891571045, + "learning_rate": 3.980364240006093e-05, + "loss": 2.9691, + "step": 1968000 + }, + { + "epoch": 0.6119368882768311, + "grad_norm": 7.839089393615723, + "learning_rate": 3.980105186205282e-05, + "loss": 2.9829, + "step": 1968500 + }, + { + "epoch": 0.612092320557318, + "grad_norm": 7.907460689544678, + "learning_rate": 3.97984613240447e-05, + "loss": 2.9763, + "step": 1969000 + }, + { + "epoch": 0.6122477528378049, + "grad_norm": 5.151594638824463, + "learning_rate": 3.979587078603659e-05, + "loss": 2.9528, + "step": 1969500 + }, + { + "epoch": 0.6124031851182917, + "grad_norm": 9.753541946411133, + "learning_rate": 3.9793280248028475e-05, + "loss": 2.9391, + "step": 1970000 + }, + { + "epoch": 0.6125586173987786, + "grad_norm": 8.721147537231445, + "learning_rate": 3.9790689710020355e-05, + "loss": 2.9597, + "step": 1970500 + }, + { + "epoch": 0.6127140496792655, + "grad_norm": 10.077067375183105, + "learning_rate": 3.978809917201224e-05, + "loss": 2.9494, + "step": 1971000 + }, + { + "epoch": 0.6128694819597523, + "grad_norm": 10.094767570495605, + "learning_rate": 3.978550863400413e-05, + "loss": 2.9696, + "step": 1971500 + }, + { + "epoch": 0.6130249142402392, + "grad_norm": 7.833797931671143, + "learning_rate": 3.978291809599602e-05, + "loss": 2.997, + "step": 1972000 + }, + { + "epoch": 0.6131803465207262, + "grad_norm": 13.059793472290039, + "learning_rate": 3.9780327557987904e-05, + "loss": 2.9395, + "step": 1972500 + }, + { + "epoch": 0.613335778801213, + "grad_norm": 7.182812213897705, + "learning_rate": 3.9777737019979784e-05, + "loss": 3.0096, + "step": 1973000 + }, + { + "epoch": 0.6134912110816999, + "grad_norm": 9.043205261230469, + "learning_rate": 3.977514648197167e-05, + "loss": 3.0234, + "step": 1973500 + }, + { + "epoch": 0.6136466433621868, + "grad_norm": 16.943164825439453, + "learning_rate": 3.977255594396356e-05, + "loss": 2.954, + "step": 1974000 + }, + { + "epoch": 0.6138020756426736, + "grad_norm": 8.593246459960938, + "learning_rate": 3.976996540595544e-05, + "loss": 2.9258, + "step": 1974500 + }, + { + "epoch": 0.6139575079231605, + "grad_norm": 7.590203285217285, + "learning_rate": 3.9767374867947326e-05, + "loss": 2.9574, + "step": 1975000 + }, + { + "epoch": 0.6141129402036474, + "grad_norm": 6.839503288269043, + "learning_rate": 3.976478432993921e-05, + "loss": 2.9701, + "step": 1975500 + }, + { + "epoch": 0.6142683724841342, + "grad_norm": 10.51854419708252, + "learning_rate": 3.9762193791931094e-05, + "loss": 2.9379, + "step": 1976000 + }, + { + "epoch": 0.6144238047646211, + "grad_norm": 9.349201202392578, + "learning_rate": 3.975960325392298e-05, + "loss": 2.9853, + "step": 1976500 + }, + { + "epoch": 0.614579237045108, + "grad_norm": 8.441229820251465, + "learning_rate": 3.975701271591487e-05, + "loss": 2.9547, + "step": 1977000 + }, + { + "epoch": 0.6147346693255948, + "grad_norm": 11.481546401977539, + "learning_rate": 3.9754422177906755e-05, + "loss": 2.9649, + "step": 1977500 + }, + { + "epoch": 0.6148901016060817, + "grad_norm": 9.400917053222656, + "learning_rate": 3.975183163989864e-05, + "loss": 2.9634, + "step": 1978000 + }, + { + "epoch": 0.6150455338865687, + "grad_norm": 9.595915794372559, + "learning_rate": 3.974924110189052e-05, + "loss": 2.9383, + "step": 1978500 + }, + { + "epoch": 0.6152009661670556, + "grad_norm": 9.041508674621582, + "learning_rate": 3.974665056388241e-05, + "loss": 2.9688, + "step": 1979000 + }, + { + "epoch": 0.6153563984475424, + "grad_norm": 9.630517959594727, + "learning_rate": 3.97440600258743e-05, + "loss": 2.9854, + "step": 1979500 + }, + { + "epoch": 0.6155118307280293, + "grad_norm": 9.31422233581543, + "learning_rate": 3.974146948786618e-05, + "loss": 2.9641, + "step": 1980000 + }, + { + "epoch": 0.6156672630085162, + "grad_norm": 7.594694137573242, + "learning_rate": 3.9738878949858064e-05, + "loss": 2.9977, + "step": 1980500 + }, + { + "epoch": 0.615822695289003, + "grad_norm": 9.093639373779297, + "learning_rate": 3.973628841184995e-05, + "loss": 2.9927, + "step": 1981000 + }, + { + "epoch": 0.6159781275694899, + "grad_norm": 8.502300262451172, + "learning_rate": 3.973369787384183e-05, + "loss": 2.9464, + "step": 1981500 + }, + { + "epoch": 0.6161335598499768, + "grad_norm": 9.988579750061035, + "learning_rate": 3.9731107335833726e-05, + "loss": 2.9751, + "step": 1982000 + }, + { + "epoch": 0.6162889921304636, + "grad_norm": 7.303534507751465, + "learning_rate": 3.972851679782561e-05, + "loss": 3.0057, + "step": 1982500 + }, + { + "epoch": 0.6164444244109505, + "grad_norm": 8.146364212036133, + "learning_rate": 3.9725926259817493e-05, + "loss": 2.9628, + "step": 1983000 + }, + { + "epoch": 0.6165998566914374, + "grad_norm": 8.856060981750488, + "learning_rate": 3.972333572180938e-05, + "loss": 2.9836, + "step": 1983500 + }, + { + "epoch": 0.6167552889719242, + "grad_norm": 9.460158348083496, + "learning_rate": 3.972074518380127e-05, + "loss": 2.9635, + "step": 1984000 + }, + { + "epoch": 0.6169107212524112, + "grad_norm": 9.588123321533203, + "learning_rate": 3.971815464579315e-05, + "loss": 2.9733, + "step": 1984500 + }, + { + "epoch": 0.6170661535328981, + "grad_norm": 6.7796854972839355, + "learning_rate": 3.9715564107785035e-05, + "loss": 2.9716, + "step": 1985000 + }, + { + "epoch": 0.6172215858133849, + "grad_norm": 6.666921615600586, + "learning_rate": 3.9712973569776916e-05, + "loss": 2.9715, + "step": 1985500 + }, + { + "epoch": 0.6173770180938718, + "grad_norm": 8.003673553466797, + "learning_rate": 3.97103830317688e-05, + "loss": 2.985, + "step": 1986000 + }, + { + "epoch": 0.6175324503743587, + "grad_norm": 12.054204940795898, + "learning_rate": 3.970779249376069e-05, + "loss": 2.9305, + "step": 1986500 + }, + { + "epoch": 0.6176878826548455, + "grad_norm": 7.384702682495117, + "learning_rate": 3.970520195575258e-05, + "loss": 2.9617, + "step": 1987000 + }, + { + "epoch": 0.6178433149353324, + "grad_norm": 10.961606979370117, + "learning_rate": 3.9702611417744464e-05, + "loss": 2.9733, + "step": 1987500 + }, + { + "epoch": 0.6179987472158193, + "grad_norm": 8.802166938781738, + "learning_rate": 3.970002087973635e-05, + "loss": 2.9808, + "step": 1988000 + }, + { + "epoch": 0.6181541794963061, + "grad_norm": 10.397404670715332, + "learning_rate": 3.969743034172823e-05, + "loss": 2.9832, + "step": 1988500 + }, + { + "epoch": 0.618309611776793, + "grad_norm": 9.420034408569336, + "learning_rate": 3.969483980372012e-05, + "loss": 2.9759, + "step": 1989000 + }, + { + "epoch": 0.6184650440572799, + "grad_norm": 10.395437240600586, + "learning_rate": 3.9692249265712006e-05, + "loss": 2.9702, + "step": 1989500 + }, + { + "epoch": 0.6186204763377667, + "grad_norm": 7.10263729095459, + "learning_rate": 3.9689658727703886e-05, + "loss": 2.9364, + "step": 1990000 + }, + { + "epoch": 0.6187759086182537, + "grad_norm": 11.498122215270996, + "learning_rate": 3.9687068189695774e-05, + "loss": 2.9466, + "step": 1990500 + }, + { + "epoch": 0.6189313408987406, + "grad_norm": 35.4597053527832, + "learning_rate": 3.9684477651687654e-05, + "loss": 2.9653, + "step": 1991000 + }, + { + "epoch": 0.6190867731792274, + "grad_norm": 9.75507640838623, + "learning_rate": 3.968188711367954e-05, + "loss": 2.9974, + "step": 1991500 + }, + { + "epoch": 0.6192422054597143, + "grad_norm": 10.218605995178223, + "learning_rate": 3.9679296575671435e-05, + "loss": 2.999, + "step": 1992000 + }, + { + "epoch": 0.6193976377402012, + "grad_norm": 9.526650428771973, + "learning_rate": 3.9676706037663315e-05, + "loss": 2.9704, + "step": 1992500 + }, + { + "epoch": 0.619553070020688, + "grad_norm": 8.087902069091797, + "learning_rate": 3.96741154996552e-05, + "loss": 2.9831, + "step": 1993000 + }, + { + "epoch": 0.6197085023011749, + "grad_norm": 8.432071685791016, + "learning_rate": 3.967152496164709e-05, + "loss": 2.9765, + "step": 1993500 + }, + { + "epoch": 0.6198639345816618, + "grad_norm": 7.875052452087402, + "learning_rate": 3.966893442363897e-05, + "loss": 2.9371, + "step": 1994000 + }, + { + "epoch": 0.6200193668621486, + "grad_norm": 8.23984432220459, + "learning_rate": 3.966634388563086e-05, + "loss": 2.9393, + "step": 1994500 + }, + { + "epoch": 0.6201747991426355, + "grad_norm": 15.545799255371094, + "learning_rate": 3.9663753347622744e-05, + "loss": 2.9243, + "step": 1995000 + }, + { + "epoch": 0.6203302314231224, + "grad_norm": 8.573948860168457, + "learning_rate": 3.9661162809614625e-05, + "loss": 2.9653, + "step": 1995500 + }, + { + "epoch": 0.6204856637036092, + "grad_norm": 7.939562797546387, + "learning_rate": 3.965857227160651e-05, + "loss": 2.925, + "step": 1996000 + }, + { + "epoch": 0.6206410959840961, + "grad_norm": 6.8238725662231445, + "learning_rate": 3.96559817335984e-05, + "loss": 3.0116, + "step": 1996500 + }, + { + "epoch": 0.6207965282645831, + "grad_norm": 8.564602851867676, + "learning_rate": 3.9653391195590286e-05, + "loss": 2.9435, + "step": 1997000 + }, + { + "epoch": 0.62095196054507, + "grad_norm": 6.975132465362549, + "learning_rate": 3.9650800657582173e-05, + "loss": 2.9636, + "step": 1997500 + }, + { + "epoch": 0.6211073928255568, + "grad_norm": 9.32268238067627, + "learning_rate": 3.9648210119574054e-05, + "loss": 2.9155, + "step": 1998000 + }, + { + "epoch": 0.6212628251060437, + "grad_norm": 9.059436798095703, + "learning_rate": 3.964561958156594e-05, + "loss": 2.9352, + "step": 1998500 + }, + { + "epoch": 0.6214182573865306, + "grad_norm": 12.609841346740723, + "learning_rate": 3.964302904355783e-05, + "loss": 2.9522, + "step": 1999000 + }, + { + "epoch": 0.6215736896670174, + "grad_norm": 13.839472770690918, + "learning_rate": 3.964043850554971e-05, + "loss": 2.9964, + "step": 1999500 + }, + { + "epoch": 0.6217291219475043, + "grad_norm": 8.840629577636719, + "learning_rate": 3.9637847967541596e-05, + "loss": 2.9491, + "step": 2000000 + }, + { + "epoch": 0.6218845542279912, + "grad_norm": 9.405591011047363, + "learning_rate": 3.963525742953348e-05, + "loss": 2.9078, + "step": 2000500 + }, + { + "epoch": 0.622039986508478, + "grad_norm": 7.740964889526367, + "learning_rate": 3.963266689152536e-05, + "loss": 2.9953, + "step": 2001000 + }, + { + "epoch": 0.6221954187889649, + "grad_norm": 9.654718399047852, + "learning_rate": 3.963007635351725e-05, + "loss": 2.9468, + "step": 2001500 + }, + { + "epoch": 0.6223508510694518, + "grad_norm": 7.721446514129639, + "learning_rate": 3.9627485815509144e-05, + "loss": 2.9791, + "step": 2002000 + }, + { + "epoch": 0.6225062833499386, + "grad_norm": 8.817960739135742, + "learning_rate": 3.9624895277501025e-05, + "loss": 2.9507, + "step": 2002500 + }, + { + "epoch": 0.6226617156304256, + "grad_norm": 9.394454956054688, + "learning_rate": 3.962230473949291e-05, + "loss": 2.9197, + "step": 2003000 + }, + { + "epoch": 0.6228171479109125, + "grad_norm": 9.776477813720703, + "learning_rate": 3.961971420148479e-05, + "loss": 2.9441, + "step": 2003500 + }, + { + "epoch": 0.6229725801913993, + "grad_norm": 8.220660209655762, + "learning_rate": 3.961712366347668e-05, + "loss": 2.9664, + "step": 2004000 + }, + { + "epoch": 0.6231280124718862, + "grad_norm": 7.078343868255615, + "learning_rate": 3.9614533125468566e-05, + "loss": 2.9925, + "step": 2004500 + }, + { + "epoch": 0.6232834447523731, + "grad_norm": 8.524077415466309, + "learning_rate": 3.961194258746045e-05, + "loss": 2.975, + "step": 2005000 + }, + { + "epoch": 0.6234388770328599, + "grad_norm": 9.522829055786133, + "learning_rate": 3.9609352049452334e-05, + "loss": 3.0005, + "step": 2005500 + }, + { + "epoch": 0.6235943093133468, + "grad_norm": 10.977128982543945, + "learning_rate": 3.960676151144422e-05, + "loss": 2.9401, + "step": 2006000 + }, + { + "epoch": 0.6237497415938337, + "grad_norm": 7.9097819328308105, + "learning_rate": 3.960417097343611e-05, + "loss": 2.9136, + "step": 2006500 + }, + { + "epoch": 0.6239051738743205, + "grad_norm": 9.48869514465332, + "learning_rate": 3.9601580435427995e-05, + "loss": 2.9765, + "step": 2007000 + }, + { + "epoch": 0.6240606061548074, + "grad_norm": 13.881948471069336, + "learning_rate": 3.959898989741988e-05, + "loss": 2.9235, + "step": 2007500 + }, + { + "epoch": 0.6242160384352943, + "grad_norm": 11.537532806396484, + "learning_rate": 3.959639935941176e-05, + "loss": 2.9804, + "step": 2008000 + }, + { + "epoch": 0.6243714707157811, + "grad_norm": 6.298761367797852, + "learning_rate": 3.959380882140365e-05, + "loss": 2.9331, + "step": 2008500 + }, + { + "epoch": 0.6245269029962681, + "grad_norm": 6.741898536682129, + "learning_rate": 3.959121828339553e-05, + "loss": 2.9719, + "step": 2009000 + }, + { + "epoch": 0.624682335276755, + "grad_norm": 7.259893417358398, + "learning_rate": 3.958862774538742e-05, + "loss": 2.9924, + "step": 2009500 + }, + { + "epoch": 0.6248377675572419, + "grad_norm": 8.425070762634277, + "learning_rate": 3.9586037207379305e-05, + "loss": 2.9722, + "step": 2010000 + }, + { + "epoch": 0.6249931998377287, + "grad_norm": 7.61340856552124, + "learning_rate": 3.9583446669371185e-05, + "loss": 2.9967, + "step": 2010500 + }, + { + "epoch": 0.6251486321182156, + "grad_norm": 8.340291023254395, + "learning_rate": 3.958085613136307e-05, + "loss": 2.9669, + "step": 2011000 + }, + { + "epoch": 0.6253040643987025, + "grad_norm": 6.658201217651367, + "learning_rate": 3.957826559335496e-05, + "loss": 2.9352, + "step": 2011500 + }, + { + "epoch": 0.6254594966791893, + "grad_norm": 7.754528522491455, + "learning_rate": 3.957567505534685e-05, + "loss": 2.9382, + "step": 2012000 + }, + { + "epoch": 0.6256149289596762, + "grad_norm": 22.970285415649414, + "learning_rate": 3.9573084517338734e-05, + "loss": 2.9811, + "step": 2012500 + }, + { + "epoch": 0.625770361240163, + "grad_norm": 8.663495063781738, + "learning_rate": 3.957049397933062e-05, + "loss": 2.979, + "step": 2013000 + }, + { + "epoch": 0.6259257935206499, + "grad_norm": 7.458810329437256, + "learning_rate": 3.95679034413225e-05, + "loss": 2.9787, + "step": 2013500 + }, + { + "epoch": 0.6260812258011368, + "grad_norm": 10.381972312927246, + "learning_rate": 3.956531290331439e-05, + "loss": 2.9964, + "step": 2014000 + }, + { + "epoch": 0.6262366580816237, + "grad_norm": 8.872797012329102, + "learning_rate": 3.9562722365306276e-05, + "loss": 2.9839, + "step": 2014500 + }, + { + "epoch": 0.6263920903621106, + "grad_norm": 8.214437484741211, + "learning_rate": 3.9560131827298156e-05, + "loss": 2.9542, + "step": 2015000 + }, + { + "epoch": 0.6265475226425975, + "grad_norm": 7.56722354888916, + "learning_rate": 3.955754128929004e-05, + "loss": 3.0, + "step": 2015500 + }, + { + "epoch": 0.6267029549230844, + "grad_norm": 11.07796573638916, + "learning_rate": 3.955495075128193e-05, + "loss": 2.979, + "step": 2016000 + }, + { + "epoch": 0.6268583872035712, + "grad_norm": 6.196483612060547, + "learning_rate": 3.955236021327382e-05, + "loss": 2.8808, + "step": 2016500 + }, + { + "epoch": 0.6270138194840581, + "grad_norm": 8.914560317993164, + "learning_rate": 3.9549769675265705e-05, + "loss": 2.9839, + "step": 2017000 + }, + { + "epoch": 0.627169251764545, + "grad_norm": 7.020608901977539, + "learning_rate": 3.9547179137257585e-05, + "loss": 2.9605, + "step": 2017500 + }, + { + "epoch": 0.6273246840450318, + "grad_norm": 8.857983589172363, + "learning_rate": 3.954458859924947e-05, + "loss": 2.9769, + "step": 2018000 + }, + { + "epoch": 0.6274801163255187, + "grad_norm": 9.947704315185547, + "learning_rate": 3.954199806124136e-05, + "loss": 2.9502, + "step": 2018500 + }, + { + "epoch": 0.6276355486060056, + "grad_norm": 20.203540802001953, + "learning_rate": 3.953940752323324e-05, + "loss": 2.9459, + "step": 2019000 + }, + { + "epoch": 0.6277909808864924, + "grad_norm": 9.408906936645508, + "learning_rate": 3.953681698522513e-05, + "loss": 2.9152, + "step": 2019500 + }, + { + "epoch": 0.6279464131669793, + "grad_norm": 11.240312576293945, + "learning_rate": 3.9534226447217014e-05, + "loss": 2.9549, + "step": 2020000 + }, + { + "epoch": 0.6281018454474662, + "grad_norm": 8.772941589355469, + "learning_rate": 3.9531635909208894e-05, + "loss": 3.001, + "step": 2020500 + }, + { + "epoch": 0.6282572777279531, + "grad_norm": 7.741700649261475, + "learning_rate": 3.952904537120078e-05, + "loss": 2.9772, + "step": 2021000 + }, + { + "epoch": 0.62841271000844, + "grad_norm": 8.880668640136719, + "learning_rate": 3.952645483319267e-05, + "loss": 2.9577, + "step": 2021500 + }, + { + "epoch": 0.6285681422889269, + "grad_norm": 12.446510314941406, + "learning_rate": 3.9523864295184556e-05, + "loss": 2.9503, + "step": 2022000 + }, + { + "epoch": 0.6287235745694137, + "grad_norm": 7.1574530601501465, + "learning_rate": 3.952127375717644e-05, + "loss": 2.9972, + "step": 2022500 + }, + { + "epoch": 0.6288790068499006, + "grad_norm": 15.000044822692871, + "learning_rate": 3.9518683219168323e-05, + "loss": 2.9248, + "step": 2023000 + }, + { + "epoch": 0.6290344391303875, + "grad_norm": 9.70949649810791, + "learning_rate": 3.951609268116021e-05, + "loss": 2.9449, + "step": 2023500 + }, + { + "epoch": 0.6291898714108743, + "grad_norm": 10.410236358642578, + "learning_rate": 3.95135021431521e-05, + "loss": 2.99, + "step": 2024000 + }, + { + "epoch": 0.6293453036913612, + "grad_norm": 6.319430828094482, + "learning_rate": 3.951091160514398e-05, + "loss": 2.9441, + "step": 2024500 + }, + { + "epoch": 0.6295007359718481, + "grad_norm": 10.251976013183594, + "learning_rate": 3.9508321067135865e-05, + "loss": 2.9093, + "step": 2025000 + }, + { + "epoch": 0.629656168252335, + "grad_norm": 7.454419136047363, + "learning_rate": 3.950573052912775e-05, + "loss": 2.9923, + "step": 2025500 + }, + { + "epoch": 0.6298116005328218, + "grad_norm": 7.927022457122803, + "learning_rate": 3.950313999111964e-05, + "loss": 2.9085, + "step": 2026000 + }, + { + "epoch": 0.6299670328133087, + "grad_norm": 14.19694995880127, + "learning_rate": 3.950054945311153e-05, + "loss": 2.9535, + "step": 2026500 + }, + { + "epoch": 0.6301224650937957, + "grad_norm": 10.217572212219238, + "learning_rate": 3.949795891510341e-05, + "loss": 2.9606, + "step": 2027000 + }, + { + "epoch": 0.6302778973742825, + "grad_norm": 8.756054878234863, + "learning_rate": 3.9495368377095294e-05, + "loss": 2.9621, + "step": 2027500 + }, + { + "epoch": 0.6304333296547694, + "grad_norm": 10.145346641540527, + "learning_rate": 3.949277783908718e-05, + "loss": 2.9281, + "step": 2028000 + }, + { + "epoch": 0.6305887619352563, + "grad_norm": 11.156573295593262, + "learning_rate": 3.949018730107906e-05, + "loss": 2.9577, + "step": 2028500 + }, + { + "epoch": 0.6307441942157431, + "grad_norm": 8.519734382629395, + "learning_rate": 3.948759676307095e-05, + "loss": 2.9581, + "step": 2029000 + }, + { + "epoch": 0.63089962649623, + "grad_norm": 23.921628952026367, + "learning_rate": 3.9485006225062836e-05, + "loss": 2.924, + "step": 2029500 + }, + { + "epoch": 0.6310550587767169, + "grad_norm": 7.316037654876709, + "learning_rate": 3.9482415687054716e-05, + "loss": 2.9513, + "step": 2030000 + }, + { + "epoch": 0.6312104910572037, + "grad_norm": 15.584917068481445, + "learning_rate": 3.9479825149046604e-05, + "loss": 2.9266, + "step": 2030500 + }, + { + "epoch": 0.6313659233376906, + "grad_norm": 9.849332809448242, + "learning_rate": 3.947723461103849e-05, + "loss": 2.946, + "step": 2031000 + }, + { + "epoch": 0.6315213556181775, + "grad_norm": 7.999297142028809, + "learning_rate": 3.947464407303038e-05, + "loss": 2.9897, + "step": 2031500 + }, + { + "epoch": 0.6316767878986643, + "grad_norm": 8.943427085876465, + "learning_rate": 3.9472053535022265e-05, + "loss": 2.9714, + "step": 2032000 + }, + { + "epoch": 0.6318322201791512, + "grad_norm": 42.911048889160156, + "learning_rate": 3.946946299701415e-05, + "loss": 2.9392, + "step": 2032500 + }, + { + "epoch": 0.6319876524596382, + "grad_norm": 6.202180862426758, + "learning_rate": 3.946687245900603e-05, + "loss": 2.941, + "step": 2033000 + }, + { + "epoch": 0.632143084740125, + "grad_norm": 10.203930854797363, + "learning_rate": 3.946428192099792e-05, + "loss": 2.943, + "step": 2033500 + }, + { + "epoch": 0.6322985170206119, + "grad_norm": 48.69913864135742, + "learning_rate": 3.94616913829898e-05, + "loss": 2.9387, + "step": 2034000 + }, + { + "epoch": 0.6324539493010988, + "grad_norm": 9.145930290222168, + "learning_rate": 3.945910084498169e-05, + "loss": 2.9298, + "step": 2034500 + }, + { + "epoch": 0.6326093815815856, + "grad_norm": 9.595843315124512, + "learning_rate": 3.9456510306973574e-05, + "loss": 2.9293, + "step": 2035000 + }, + { + "epoch": 0.6327648138620725, + "grad_norm": 8.072699546813965, + "learning_rate": 3.945391976896546e-05, + "loss": 2.9561, + "step": 2035500 + }, + { + "epoch": 0.6329202461425594, + "grad_norm": 9.094868659973145, + "learning_rate": 3.945132923095735e-05, + "loss": 2.9232, + "step": 2036000 + }, + { + "epoch": 0.6330756784230462, + "grad_norm": 9.247415542602539, + "learning_rate": 3.9448738692949236e-05, + "loss": 2.9891, + "step": 2036500 + }, + { + "epoch": 0.6332311107035331, + "grad_norm": 11.757457733154297, + "learning_rate": 3.9446148154941116e-05, + "loss": 2.9337, + "step": 2037000 + }, + { + "epoch": 0.63338654298402, + "grad_norm": 7.318173885345459, + "learning_rate": 3.9443557616933003e-05, + "loss": 2.9154, + "step": 2037500 + }, + { + "epoch": 0.6335419752645068, + "grad_norm": 15.271185874938965, + "learning_rate": 3.944096707892489e-05, + "loss": 2.9683, + "step": 2038000 + }, + { + "epoch": 0.6336974075449937, + "grad_norm": 25.86871910095215, + "learning_rate": 3.943837654091677e-05, + "loss": 3.0026, + "step": 2038500 + }, + { + "epoch": 0.6338528398254807, + "grad_norm": 9.291749000549316, + "learning_rate": 3.943578600290866e-05, + "loss": 3.0227, + "step": 2039000 + }, + { + "epoch": 0.6340082721059676, + "grad_norm": 8.373222351074219, + "learning_rate": 3.943319546490054e-05, + "loss": 2.9392, + "step": 2039500 + }, + { + "epoch": 0.6341637043864544, + "grad_norm": 9.10552978515625, + "learning_rate": 3.9430604926892426e-05, + "loss": 2.9259, + "step": 2040000 + }, + { + "epoch": 0.6343191366669413, + "grad_norm": 10.379484176635742, + "learning_rate": 3.942801438888431e-05, + "loss": 2.9492, + "step": 2040500 + }, + { + "epoch": 0.6344745689474282, + "grad_norm": 8.87575912475586, + "learning_rate": 3.94254238508762e-05, + "loss": 2.8818, + "step": 2041000 + }, + { + "epoch": 0.634630001227915, + "grad_norm": 6.534691333770752, + "learning_rate": 3.942283331286809e-05, + "loss": 3.0052, + "step": 2041500 + }, + { + "epoch": 0.6347854335084019, + "grad_norm": 8.149225234985352, + "learning_rate": 3.9420242774859974e-05, + "loss": 2.951, + "step": 2042000 + }, + { + "epoch": 0.6349408657888888, + "grad_norm": 10.06755542755127, + "learning_rate": 3.9417652236851855e-05, + "loss": 2.9411, + "step": 2042500 + }, + { + "epoch": 0.6350962980693756, + "grad_norm": 80.64075469970703, + "learning_rate": 3.941506169884374e-05, + "loss": 2.9943, + "step": 2043000 + }, + { + "epoch": 0.6352517303498625, + "grad_norm": 10.652227401733398, + "learning_rate": 3.941247116083563e-05, + "loss": 2.9811, + "step": 2043500 + }, + { + "epoch": 0.6354071626303494, + "grad_norm": 9.11721134185791, + "learning_rate": 3.940988062282751e-05, + "loss": 2.9323, + "step": 2044000 + }, + { + "epoch": 0.6355625949108362, + "grad_norm": 19.214252471923828, + "learning_rate": 3.9407290084819396e-05, + "loss": 2.9074, + "step": 2044500 + }, + { + "epoch": 0.6357180271913232, + "grad_norm": 23.355005264282227, + "learning_rate": 3.940469954681128e-05, + "loss": 2.9535, + "step": 2045000 + }, + { + "epoch": 0.6358734594718101, + "grad_norm": 7.301501274108887, + "learning_rate": 3.940210900880317e-05, + "loss": 2.9094, + "step": 2045500 + }, + { + "epoch": 0.6360288917522969, + "grad_norm": 14.353507995605469, + "learning_rate": 3.939951847079506e-05, + "loss": 2.9368, + "step": 2046000 + }, + { + "epoch": 0.6361843240327838, + "grad_norm": 6.768465042114258, + "learning_rate": 3.939692793278694e-05, + "loss": 2.9635, + "step": 2046500 + }, + { + "epoch": 0.6363397563132707, + "grad_norm": 8.247093200683594, + "learning_rate": 3.9394337394778825e-05, + "loss": 2.9059, + "step": 2047000 + }, + { + "epoch": 0.6364951885937575, + "grad_norm": 8.063271522521973, + "learning_rate": 3.939174685677071e-05, + "loss": 2.9585, + "step": 2047500 + }, + { + "epoch": 0.6366506208742444, + "grad_norm": 6.235685348510742, + "learning_rate": 3.938915631876259e-05, + "loss": 2.9476, + "step": 2048000 + }, + { + "epoch": 0.6368060531547313, + "grad_norm": 9.470033645629883, + "learning_rate": 3.938656578075448e-05, + "loss": 2.9121, + "step": 2048500 + }, + { + "epoch": 0.6369614854352181, + "grad_norm": 8.24569034576416, + "learning_rate": 3.938397524274637e-05, + "loss": 2.976, + "step": 2049000 + }, + { + "epoch": 0.637116917715705, + "grad_norm": 9.86475944519043, + "learning_rate": 3.938138470473825e-05, + "loss": 2.9399, + "step": 2049500 + }, + { + "epoch": 0.6372723499961919, + "grad_norm": 11.834066390991211, + "learning_rate": 3.9378794166730135e-05, + "loss": 2.9261, + "step": 2050000 + }, + { + "epoch": 0.6374277822766787, + "grad_norm": 8.565091133117676, + "learning_rate": 3.937620362872202e-05, + "loss": 2.9572, + "step": 2050500 + }, + { + "epoch": 0.6375832145571657, + "grad_norm": 14.757675170898438, + "learning_rate": 3.937361309071391e-05, + "loss": 2.9849, + "step": 2051000 + }, + { + "epoch": 0.6377386468376526, + "grad_norm": 8.59937858581543, + "learning_rate": 3.9371022552705796e-05, + "loss": 2.9349, + "step": 2051500 + }, + { + "epoch": 0.6378940791181394, + "grad_norm": 10.007719039916992, + "learning_rate": 3.936843201469768e-05, + "loss": 2.9347, + "step": 2052000 + }, + { + "epoch": 0.6380495113986263, + "grad_norm": 9.384624481201172, + "learning_rate": 3.9365841476689564e-05, + "loss": 2.9065, + "step": 2052500 + }, + { + "epoch": 0.6382049436791132, + "grad_norm": 10.208598136901855, + "learning_rate": 3.936325093868145e-05, + "loss": 2.9591, + "step": 2053000 + }, + { + "epoch": 0.6383603759596, + "grad_norm": 8.456744194030762, + "learning_rate": 3.936066040067333e-05, + "loss": 2.9413, + "step": 2053500 + }, + { + "epoch": 0.6385158082400869, + "grad_norm": 7.474422454833984, + "learning_rate": 3.935806986266522e-05, + "loss": 2.9567, + "step": 2054000 + }, + { + "epoch": 0.6386712405205738, + "grad_norm": 10.101318359375, + "learning_rate": 3.9355479324657106e-05, + "loss": 2.9836, + "step": 2054500 + }, + { + "epoch": 0.6388266728010606, + "grad_norm": 10.949272155761719, + "learning_rate": 3.9352888786648986e-05, + "loss": 2.959, + "step": 2055000 + }, + { + "epoch": 0.6389821050815475, + "grad_norm": 7.430112838745117, + "learning_rate": 3.935029824864088e-05, + "loss": 2.9354, + "step": 2055500 + }, + { + "epoch": 0.6391375373620344, + "grad_norm": 6.620362758636475, + "learning_rate": 3.934770771063277e-05, + "loss": 2.9255, + "step": 2056000 + }, + { + "epoch": 0.6392929696425212, + "grad_norm": 10.54958438873291, + "learning_rate": 3.934511717262465e-05, + "loss": 2.9385, + "step": 2056500 + }, + { + "epoch": 0.6394484019230082, + "grad_norm": 9.330659866333008, + "learning_rate": 3.9342526634616535e-05, + "loss": 2.9803, + "step": 2057000 + }, + { + "epoch": 0.6396038342034951, + "grad_norm": 7.3835649490356445, + "learning_rate": 3.9339936096608415e-05, + "loss": 2.9677, + "step": 2057500 + }, + { + "epoch": 0.639759266483982, + "grad_norm": 8.244956016540527, + "learning_rate": 3.93373455586003e-05, + "loss": 2.9004, + "step": 2058000 + }, + { + "epoch": 0.6399146987644688, + "grad_norm": 20.965457916259766, + "learning_rate": 3.933475502059219e-05, + "loss": 2.9536, + "step": 2058500 + }, + { + "epoch": 0.6400701310449557, + "grad_norm": 8.640542984008789, + "learning_rate": 3.933216448258407e-05, + "loss": 2.9213, + "step": 2059000 + }, + { + "epoch": 0.6402255633254426, + "grad_norm": 7.726798057556152, + "learning_rate": 3.932957394457596e-05, + "loss": 2.9846, + "step": 2059500 + }, + { + "epoch": 0.6403809956059294, + "grad_norm": 18.17774772644043, + "learning_rate": 3.9326983406567844e-05, + "loss": 2.9734, + "step": 2060000 + }, + { + "epoch": 0.6405364278864163, + "grad_norm": 10.57304573059082, + "learning_rate": 3.932439286855973e-05, + "loss": 2.9884, + "step": 2060500 + }, + { + "epoch": 0.6406918601669032, + "grad_norm": 9.399552345275879, + "learning_rate": 3.932180233055162e-05, + "loss": 2.9235, + "step": 2061000 + }, + { + "epoch": 0.64084729244739, + "grad_norm": 9.491347312927246, + "learning_rate": 3.9319211792543505e-05, + "loss": 2.9826, + "step": 2061500 + }, + { + "epoch": 0.6410027247278769, + "grad_norm": 11.178973197937012, + "learning_rate": 3.9316621254535386e-05, + "loss": 2.9271, + "step": 2062000 + }, + { + "epoch": 0.6411581570083638, + "grad_norm": 15.178946495056152, + "learning_rate": 3.931403071652727e-05, + "loss": 2.9354, + "step": 2062500 + }, + { + "epoch": 0.6413135892888507, + "grad_norm": 7.635277271270752, + "learning_rate": 3.931144017851915e-05, + "loss": 2.9493, + "step": 2063000 + }, + { + "epoch": 0.6414690215693376, + "grad_norm": 7.677488327026367, + "learning_rate": 3.930884964051104e-05, + "loss": 2.9513, + "step": 2063500 + }, + { + "epoch": 0.6416244538498245, + "grad_norm": 11.260096549987793, + "learning_rate": 3.930625910250293e-05, + "loss": 2.9668, + "step": 2064000 + }, + { + "epoch": 0.6417798861303113, + "grad_norm": 12.506056785583496, + "learning_rate": 3.930366856449481e-05, + "loss": 2.9812, + "step": 2064500 + }, + { + "epoch": 0.6419353184107982, + "grad_norm": 9.884620666503906, + "learning_rate": 3.9301078026486695e-05, + "loss": 2.8953, + "step": 2065000 + }, + { + "epoch": 0.6420907506912851, + "grad_norm": 12.90123176574707, + "learning_rate": 3.929848748847859e-05, + "loss": 2.9368, + "step": 2065500 + }, + { + "epoch": 0.6422461829717719, + "grad_norm": 8.186291694641113, + "learning_rate": 3.929589695047047e-05, + "loss": 2.9396, + "step": 2066000 + }, + { + "epoch": 0.6424016152522588, + "grad_norm": 11.558040618896484, + "learning_rate": 3.929330641246236e-05, + "loss": 2.9954, + "step": 2066500 + }, + { + "epoch": 0.6425570475327457, + "grad_norm": 9.461678504943848, + "learning_rate": 3.9290715874454244e-05, + "loss": 2.9547, + "step": 2067000 + }, + { + "epoch": 0.6427124798132325, + "grad_norm": 15.57570743560791, + "learning_rate": 3.9288125336446124e-05, + "loss": 2.942, + "step": 2067500 + }, + { + "epoch": 0.6428679120937194, + "grad_norm": 9.002077102661133, + "learning_rate": 3.928553479843801e-05, + "loss": 2.9376, + "step": 2068000 + }, + { + "epoch": 0.6430233443742063, + "grad_norm": 10.186960220336914, + "learning_rate": 3.92829442604299e-05, + "loss": 3.0038, + "step": 2068500 + }, + { + "epoch": 0.6431787766546933, + "grad_norm": 7.889993667602539, + "learning_rate": 3.928035372242178e-05, + "loss": 2.9377, + "step": 2069000 + }, + { + "epoch": 0.6433342089351801, + "grad_norm": 9.300683975219727, + "learning_rate": 3.9277763184413666e-05, + "loss": 2.9466, + "step": 2069500 + }, + { + "epoch": 0.643489641215667, + "grad_norm": 7.945773601531982, + "learning_rate": 3.927517264640555e-05, + "loss": 2.9548, + "step": 2070000 + }, + { + "epoch": 0.6436450734961539, + "grad_norm": 7.667304515838623, + "learning_rate": 3.927258210839744e-05, + "loss": 2.945, + "step": 2070500 + }, + { + "epoch": 0.6438005057766407, + "grad_norm": 6.977916240692139, + "learning_rate": 3.926999157038933e-05, + "loss": 2.9353, + "step": 2071000 + }, + { + "epoch": 0.6439559380571276, + "grad_norm": 8.8089017868042, + "learning_rate": 3.926740103238121e-05, + "loss": 2.9435, + "step": 2071500 + }, + { + "epoch": 0.6441113703376145, + "grad_norm": 9.246885299682617, + "learning_rate": 3.9264810494373095e-05, + "loss": 2.9362, + "step": 2072000 + }, + { + "epoch": 0.6442668026181013, + "grad_norm": 8.965583801269531, + "learning_rate": 3.926221995636498e-05, + "loss": 2.9517, + "step": 2072500 + }, + { + "epoch": 0.6444222348985882, + "grad_norm": 10.993040084838867, + "learning_rate": 3.925962941835686e-05, + "loss": 2.9506, + "step": 2073000 + }, + { + "epoch": 0.644577667179075, + "grad_norm": 9.14529800415039, + "learning_rate": 3.925703888034875e-05, + "loss": 2.9347, + "step": 2073500 + }, + { + "epoch": 0.6447330994595619, + "grad_norm": 9.176896095275879, + "learning_rate": 3.925444834234064e-05, + "loss": 2.942, + "step": 2074000 + }, + { + "epoch": 0.6448885317400488, + "grad_norm": 9.432635307312012, + "learning_rate": 3.925185780433252e-05, + "loss": 2.9246, + "step": 2074500 + }, + { + "epoch": 0.6450439640205358, + "grad_norm": 8.25409984588623, + "learning_rate": 3.9249267266324404e-05, + "loss": 2.9518, + "step": 2075000 + }, + { + "epoch": 0.6451993963010226, + "grad_norm": 17.316099166870117, + "learning_rate": 3.924667672831629e-05, + "loss": 2.9141, + "step": 2075500 + }, + { + "epoch": 0.6453548285815095, + "grad_norm": 8.708656311035156, + "learning_rate": 3.924408619030818e-05, + "loss": 2.9604, + "step": 2076000 + }, + { + "epoch": 0.6455102608619964, + "grad_norm": 7.812131881713867, + "learning_rate": 3.9241495652300066e-05, + "loss": 2.9714, + "step": 2076500 + }, + { + "epoch": 0.6456656931424832, + "grad_norm": 8.19330883026123, + "learning_rate": 3.9238905114291946e-05, + "loss": 2.954, + "step": 2077000 + }, + { + "epoch": 0.6458211254229701, + "grad_norm": 31.239734649658203, + "learning_rate": 3.923631457628383e-05, + "loss": 2.9529, + "step": 2077500 + }, + { + "epoch": 0.645976557703457, + "grad_norm": 8.476414680480957, + "learning_rate": 3.923372403827572e-05, + "loss": 2.9856, + "step": 2078000 + }, + { + "epoch": 0.6461319899839438, + "grad_norm": 10.805356979370117, + "learning_rate": 3.92311335002676e-05, + "loss": 2.8915, + "step": 2078500 + }, + { + "epoch": 0.6462874222644307, + "grad_norm": 8.019328117370605, + "learning_rate": 3.922854296225949e-05, + "loss": 2.9941, + "step": 2079000 + }, + { + "epoch": 0.6464428545449176, + "grad_norm": 8.526105880737305, + "learning_rate": 3.9225952424251375e-05, + "loss": 2.9144, + "step": 2079500 + }, + { + "epoch": 0.6465982868254044, + "grad_norm": 7.795298099517822, + "learning_rate": 3.922336188624326e-05, + "loss": 2.9495, + "step": 2080000 + }, + { + "epoch": 0.6467537191058913, + "grad_norm": 10.087972640991211, + "learning_rate": 3.922077134823515e-05, + "loss": 2.9778, + "step": 2080500 + }, + { + "epoch": 0.6469091513863783, + "grad_norm": 22.136507034301758, + "learning_rate": 3.921818081022703e-05, + "loss": 2.927, + "step": 2081000 + }, + { + "epoch": 0.6470645836668651, + "grad_norm": 8.726430892944336, + "learning_rate": 3.921559027221892e-05, + "loss": 2.9335, + "step": 2081500 + }, + { + "epoch": 0.647220015947352, + "grad_norm": 11.009544372558594, + "learning_rate": 3.9212999734210804e-05, + "loss": 2.9534, + "step": 2082000 + }, + { + "epoch": 0.6473754482278389, + "grad_norm": 8.544248580932617, + "learning_rate": 3.9210409196202685e-05, + "loss": 2.95, + "step": 2082500 + }, + { + "epoch": 0.6475308805083257, + "grad_norm": 9.008464813232422, + "learning_rate": 3.920781865819457e-05, + "loss": 2.9393, + "step": 2083000 + }, + { + "epoch": 0.6476863127888126, + "grad_norm": 10.763798713684082, + "learning_rate": 3.920522812018646e-05, + "loss": 2.9386, + "step": 2083500 + }, + { + "epoch": 0.6478417450692995, + "grad_norm": 8.858410835266113, + "learning_rate": 3.920263758217834e-05, + "loss": 2.9196, + "step": 2084000 + }, + { + "epoch": 0.6479971773497863, + "grad_norm": 8.042263984680176, + "learning_rate": 3.9200047044170226e-05, + "loss": 2.9545, + "step": 2084500 + }, + { + "epoch": 0.6481526096302732, + "grad_norm": 7.918876647949219, + "learning_rate": 3.9197456506162114e-05, + "loss": 2.9513, + "step": 2085000 + }, + { + "epoch": 0.6483080419107601, + "grad_norm": 10.484896659851074, + "learning_rate": 3.9194865968154e-05, + "loss": 2.9393, + "step": 2085500 + }, + { + "epoch": 0.648463474191247, + "grad_norm": 9.20781421661377, + "learning_rate": 3.919227543014589e-05, + "loss": 2.9827, + "step": 2086000 + }, + { + "epoch": 0.6486189064717338, + "grad_norm": 9.48405647277832, + "learning_rate": 3.9189684892137775e-05, + "loss": 2.9714, + "step": 2086500 + }, + { + "epoch": 0.6487743387522208, + "grad_norm": 8.126235961914062, + "learning_rate": 3.9187094354129655e-05, + "loss": 2.9942, + "step": 2087000 + }, + { + "epoch": 0.6489297710327077, + "grad_norm": 8.342126846313477, + "learning_rate": 3.918450381612154e-05, + "loss": 2.9417, + "step": 2087500 + }, + { + "epoch": 0.6490852033131945, + "grad_norm": 8.415104866027832, + "learning_rate": 3.918191327811342e-05, + "loss": 2.9626, + "step": 2088000 + }, + { + "epoch": 0.6492406355936814, + "grad_norm": 6.20131254196167, + "learning_rate": 3.917932274010531e-05, + "loss": 2.9407, + "step": 2088500 + }, + { + "epoch": 0.6493960678741683, + "grad_norm": 8.293252944946289, + "learning_rate": 3.91767322020972e-05, + "loss": 2.944, + "step": 2089000 + }, + { + "epoch": 0.6495515001546551, + "grad_norm": 26.28577995300293, + "learning_rate": 3.9174141664089084e-05, + "loss": 2.8761, + "step": 2089500 + }, + { + "epoch": 0.649706932435142, + "grad_norm": 8.935319900512695, + "learning_rate": 3.917155112608097e-05, + "loss": 3.0113, + "step": 2090000 + }, + { + "epoch": 0.6498623647156289, + "grad_norm": 5.666107654571533, + "learning_rate": 3.916896058807286e-05, + "loss": 2.998, + "step": 2090500 + }, + { + "epoch": 0.6500177969961157, + "grad_norm": 9.978854179382324, + "learning_rate": 3.916637005006474e-05, + "loss": 2.9604, + "step": 2091000 + }, + { + "epoch": 0.6501732292766026, + "grad_norm": 8.272004127502441, + "learning_rate": 3.9163779512056626e-05, + "loss": 2.9719, + "step": 2091500 + }, + { + "epoch": 0.6503286615570895, + "grad_norm": 8.051385879516602, + "learning_rate": 3.916118897404851e-05, + "loss": 2.9626, + "step": 2092000 + }, + { + "epoch": 0.6504840938375763, + "grad_norm": 11.149168014526367, + "learning_rate": 3.9158598436040394e-05, + "loss": 2.9209, + "step": 2092500 + }, + { + "epoch": 0.6506395261180633, + "grad_norm": 8.485291481018066, + "learning_rate": 3.915600789803228e-05, + "loss": 2.9463, + "step": 2093000 + }, + { + "epoch": 0.6507949583985502, + "grad_norm": 8.489028930664062, + "learning_rate": 3.915341736002416e-05, + "loss": 2.9015, + "step": 2093500 + }, + { + "epoch": 0.650950390679037, + "grad_norm": 7.955577850341797, + "learning_rate": 3.915082682201605e-05, + "loss": 2.9781, + "step": 2094000 + }, + { + "epoch": 0.6511058229595239, + "grad_norm": 9.7916898727417, + "learning_rate": 3.9148236284007936e-05, + "loss": 2.9312, + "step": 2094500 + }, + { + "epoch": 0.6512612552400108, + "grad_norm": 17.6362361907959, + "learning_rate": 3.914564574599982e-05, + "loss": 2.9506, + "step": 2095000 + }, + { + "epoch": 0.6514166875204976, + "grad_norm": 13.223877906799316, + "learning_rate": 3.914305520799171e-05, + "loss": 2.9428, + "step": 2095500 + }, + { + "epoch": 0.6515721198009845, + "grad_norm": 7.061729431152344, + "learning_rate": 3.91404646699836e-05, + "loss": 2.9576, + "step": 2096000 + }, + { + "epoch": 0.6517275520814714, + "grad_norm": 11.769634246826172, + "learning_rate": 3.913787413197548e-05, + "loss": 2.9668, + "step": 2096500 + }, + { + "epoch": 0.6518829843619582, + "grad_norm": 6.576639652252197, + "learning_rate": 3.9135283593967365e-05, + "loss": 2.9972, + "step": 2097000 + }, + { + "epoch": 0.6520384166424451, + "grad_norm": 21.041664123535156, + "learning_rate": 3.913269305595925e-05, + "loss": 2.8813, + "step": 2097500 + }, + { + "epoch": 0.652193848922932, + "grad_norm": 11.062896728515625, + "learning_rate": 3.913010251795113e-05, + "loss": 2.9662, + "step": 2098000 + }, + { + "epoch": 0.6523492812034188, + "grad_norm": 7.981196403503418, + "learning_rate": 3.912751197994302e-05, + "loss": 2.9937, + "step": 2098500 + }, + { + "epoch": 0.6525047134839058, + "grad_norm": 9.555536270141602, + "learning_rate": 3.9124921441934906e-05, + "loss": 2.9471, + "step": 2099000 + }, + { + "epoch": 0.6526601457643927, + "grad_norm": 16.25929832458496, + "learning_rate": 3.9122330903926794e-05, + "loss": 2.9438, + "step": 2099500 + }, + { + "epoch": 0.6528155780448796, + "grad_norm": 9.283909797668457, + "learning_rate": 3.911974036591868e-05, + "loss": 2.9684, + "step": 2100000 + }, + { + "epoch": 0.6529710103253664, + "grad_norm": 10.907447814941406, + "learning_rate": 3.911714982791056e-05, + "loss": 2.9726, + "step": 2100500 + }, + { + "epoch": 0.6531264426058533, + "grad_norm": 8.45775032043457, + "learning_rate": 3.911455928990245e-05, + "loss": 2.9823, + "step": 2101000 + }, + { + "epoch": 0.6532818748863402, + "grad_norm": 8.17696475982666, + "learning_rate": 3.9111968751894335e-05, + "loss": 2.885, + "step": 2101500 + }, + { + "epoch": 0.653437307166827, + "grad_norm": 8.850557327270508, + "learning_rate": 3.9109378213886216e-05, + "loss": 2.9267, + "step": 2102000 + }, + { + "epoch": 0.6535927394473139, + "grad_norm": 8.06379222869873, + "learning_rate": 3.91067876758781e-05, + "loss": 2.9433, + "step": 2102500 + }, + { + "epoch": 0.6537481717278008, + "grad_norm": 9.022443771362305, + "learning_rate": 3.910419713786999e-05, + "loss": 2.9262, + "step": 2103000 + }, + { + "epoch": 0.6539036040082876, + "grad_norm": 7.22254753112793, + "learning_rate": 3.910160659986187e-05, + "loss": 2.9646, + "step": 2103500 + }, + { + "epoch": 0.6540590362887745, + "grad_norm": 13.45378589630127, + "learning_rate": 3.909901606185376e-05, + "loss": 2.9114, + "step": 2104000 + }, + { + "epoch": 0.6542144685692614, + "grad_norm": 8.779699325561523, + "learning_rate": 3.9096425523845645e-05, + "loss": 2.9677, + "step": 2104500 + }, + { + "epoch": 0.6543699008497483, + "grad_norm": 9.919974327087402, + "learning_rate": 3.909383498583753e-05, + "loss": 2.9258, + "step": 2105000 + }, + { + "epoch": 0.6545253331302352, + "grad_norm": 9.883556365966797, + "learning_rate": 3.909124444782942e-05, + "loss": 2.9298, + "step": 2105500 + }, + { + "epoch": 0.6546807654107221, + "grad_norm": 9.912273406982422, + "learning_rate": 3.90886539098213e-05, + "loss": 2.9221, + "step": 2106000 + }, + { + "epoch": 0.6548361976912089, + "grad_norm": 7.3022918701171875, + "learning_rate": 3.908606337181319e-05, + "loss": 2.9818, + "step": 2106500 + }, + { + "epoch": 0.6549916299716958, + "grad_norm": 7.756597995758057, + "learning_rate": 3.9083472833805074e-05, + "loss": 2.9386, + "step": 2107000 + }, + { + "epoch": 0.6551470622521827, + "grad_norm": 8.278937339782715, + "learning_rate": 3.9080882295796954e-05, + "loss": 2.9369, + "step": 2107500 + }, + { + "epoch": 0.6553024945326695, + "grad_norm": 18.088939666748047, + "learning_rate": 3.907829175778884e-05, + "loss": 2.9202, + "step": 2108000 + }, + { + "epoch": 0.6554579268131564, + "grad_norm": 12.397265434265137, + "learning_rate": 3.907570121978073e-05, + "loss": 2.963, + "step": 2108500 + }, + { + "epoch": 0.6556133590936433, + "grad_norm": 5.219832897186279, + "learning_rate": 3.9073110681772616e-05, + "loss": 2.9667, + "step": 2109000 + }, + { + "epoch": 0.6557687913741301, + "grad_norm": 8.883143424987793, + "learning_rate": 3.90705201437645e-05, + "loss": 2.9357, + "step": 2109500 + }, + { + "epoch": 0.655924223654617, + "grad_norm": 7.454909801483154, + "learning_rate": 3.906792960575639e-05, + "loss": 2.9796, + "step": 2110000 + }, + { + "epoch": 0.6560796559351039, + "grad_norm": 8.778684616088867, + "learning_rate": 3.906533906774827e-05, + "loss": 2.9409, + "step": 2110500 + }, + { + "epoch": 0.6562350882155908, + "grad_norm": 8.88008975982666, + "learning_rate": 3.906274852974016e-05, + "loss": 2.9229, + "step": 2111000 + }, + { + "epoch": 0.6563905204960777, + "grad_norm": 6.515708923339844, + "learning_rate": 3.906015799173204e-05, + "loss": 2.9889, + "step": 2111500 + }, + { + "epoch": 0.6565459527765646, + "grad_norm": 8.025697708129883, + "learning_rate": 3.9057567453723925e-05, + "loss": 2.9359, + "step": 2112000 + }, + { + "epoch": 0.6567013850570514, + "grad_norm": 9.947436332702637, + "learning_rate": 3.905497691571581e-05, + "loss": 2.9615, + "step": 2112500 + }, + { + "epoch": 0.6568568173375383, + "grad_norm": 9.661087036132812, + "learning_rate": 3.905238637770769e-05, + "loss": 2.9344, + "step": 2113000 + }, + { + "epoch": 0.6570122496180252, + "grad_norm": 7.826178550720215, + "learning_rate": 3.904979583969958e-05, + "loss": 2.9004, + "step": 2113500 + }, + { + "epoch": 0.657167681898512, + "grad_norm": 18.29315948486328, + "learning_rate": 3.904720530169147e-05, + "loss": 2.9831, + "step": 2114000 + }, + { + "epoch": 0.6573231141789989, + "grad_norm": 14.204398155212402, + "learning_rate": 3.9044614763683354e-05, + "loss": 2.9413, + "step": 2114500 + }, + { + "epoch": 0.6574785464594858, + "grad_norm": 8.310688018798828, + "learning_rate": 3.904202422567524e-05, + "loss": 2.9593, + "step": 2115000 + }, + { + "epoch": 0.6576339787399726, + "grad_norm": 9.683573722839355, + "learning_rate": 3.903943368766713e-05, + "loss": 2.9071, + "step": 2115500 + }, + { + "epoch": 0.6577894110204595, + "grad_norm": 16.300573348999023, + "learning_rate": 3.903684314965901e-05, + "loss": 2.959, + "step": 2116000 + }, + { + "epoch": 0.6579448433009464, + "grad_norm": 8.205500602722168, + "learning_rate": 3.9034252611650896e-05, + "loss": 2.9353, + "step": 2116500 + }, + { + "epoch": 0.6581002755814334, + "grad_norm": 8.180908203125, + "learning_rate": 3.9031662073642776e-05, + "loss": 2.9394, + "step": 2117000 + }, + { + "epoch": 0.6582557078619202, + "grad_norm": 7.194463729858398, + "learning_rate": 3.902907153563466e-05, + "loss": 2.9469, + "step": 2117500 + }, + { + "epoch": 0.6584111401424071, + "grad_norm": 8.762369155883789, + "learning_rate": 3.902648099762655e-05, + "loss": 2.9108, + "step": 2118000 + }, + { + "epoch": 0.658566572422894, + "grad_norm": 10.357124328613281, + "learning_rate": 3.902389045961843e-05, + "loss": 2.9394, + "step": 2118500 + }, + { + "epoch": 0.6587220047033808, + "grad_norm": 7.257431507110596, + "learning_rate": 3.9021299921610325e-05, + "loss": 2.9547, + "step": 2119000 + }, + { + "epoch": 0.6588774369838677, + "grad_norm": 8.576791763305664, + "learning_rate": 3.901870938360221e-05, + "loss": 2.9107, + "step": 2119500 + }, + { + "epoch": 0.6590328692643546, + "grad_norm": 8.984113693237305, + "learning_rate": 3.901611884559409e-05, + "loss": 2.943, + "step": 2120000 + }, + { + "epoch": 0.6591883015448414, + "grad_norm": 7.369329929351807, + "learning_rate": 3.901352830758598e-05, + "loss": 2.9556, + "step": 2120500 + }, + { + "epoch": 0.6593437338253283, + "grad_norm": 7.681082248687744, + "learning_rate": 3.901093776957787e-05, + "loss": 2.9387, + "step": 2121000 + }, + { + "epoch": 0.6594991661058152, + "grad_norm": 15.721251487731934, + "learning_rate": 3.900834723156975e-05, + "loss": 2.9959, + "step": 2121500 + }, + { + "epoch": 0.659654598386302, + "grad_norm": 9.586002349853516, + "learning_rate": 3.9005756693561634e-05, + "loss": 2.9349, + "step": 2122000 + }, + { + "epoch": 0.6598100306667889, + "grad_norm": 8.44922161102295, + "learning_rate": 3.900316615555352e-05, + "loss": 2.9596, + "step": 2122500 + }, + { + "epoch": 0.6599654629472759, + "grad_norm": 45.9738655090332, + "learning_rate": 3.90005756175454e-05, + "loss": 2.9391, + "step": 2123000 + }, + { + "epoch": 0.6601208952277627, + "grad_norm": 10.264196395874023, + "learning_rate": 3.899798507953729e-05, + "loss": 2.9295, + "step": 2123500 + }, + { + "epoch": 0.6602763275082496, + "grad_norm": 10.210783958435059, + "learning_rate": 3.8995394541529176e-05, + "loss": 2.9177, + "step": 2124000 + }, + { + "epoch": 0.6604317597887365, + "grad_norm": 17.014415740966797, + "learning_rate": 3.899280400352106e-05, + "loss": 2.9455, + "step": 2124500 + }, + { + "epoch": 0.6605871920692233, + "grad_norm": 35.252464294433594, + "learning_rate": 3.899021346551295e-05, + "loss": 2.965, + "step": 2125000 + }, + { + "epoch": 0.6607426243497102, + "grad_norm": 8.136777877807617, + "learning_rate": 3.898762292750483e-05, + "loss": 2.9216, + "step": 2125500 + }, + { + "epoch": 0.6608980566301971, + "grad_norm": 10.133270263671875, + "learning_rate": 3.898503238949672e-05, + "loss": 2.9089, + "step": 2126000 + }, + { + "epoch": 0.6610534889106839, + "grad_norm": 9.187017440795898, + "learning_rate": 3.8982441851488605e-05, + "loss": 2.9437, + "step": 2126500 + }, + { + "epoch": 0.6612089211911708, + "grad_norm": 9.602163314819336, + "learning_rate": 3.8979851313480485e-05, + "loss": 2.9902, + "step": 2127000 + }, + { + "epoch": 0.6613643534716577, + "grad_norm": 9.022848129272461, + "learning_rate": 3.897726077547237e-05, + "loss": 2.918, + "step": 2127500 + }, + { + "epoch": 0.6615197857521445, + "grad_norm": 8.511088371276855, + "learning_rate": 3.897467023746426e-05, + "loss": 2.9241, + "step": 2128000 + }, + { + "epoch": 0.6616752180326314, + "grad_norm": 7.660780906677246, + "learning_rate": 3.897207969945614e-05, + "loss": 2.9665, + "step": 2128500 + }, + { + "epoch": 0.6618306503131183, + "grad_norm": 10.065557479858398, + "learning_rate": 3.8969489161448034e-05, + "loss": 2.95, + "step": 2129000 + }, + { + "epoch": 0.6619860825936053, + "grad_norm": 12.738420486450195, + "learning_rate": 3.8966898623439914e-05, + "loss": 2.9712, + "step": 2129500 + }, + { + "epoch": 0.6621415148740921, + "grad_norm": 12.339788436889648, + "learning_rate": 3.89643080854318e-05, + "loss": 2.94, + "step": 2130000 + }, + { + "epoch": 0.662296947154579, + "grad_norm": 7.546443462371826, + "learning_rate": 3.896171754742369e-05, + "loss": 2.9903, + "step": 2130500 + }, + { + "epoch": 0.6624523794350659, + "grad_norm": 8.286978721618652, + "learning_rate": 3.895912700941557e-05, + "loss": 2.9505, + "step": 2131000 + }, + { + "epoch": 0.6626078117155527, + "grad_norm": 7.90731143951416, + "learning_rate": 3.8956536471407456e-05, + "loss": 2.959, + "step": 2131500 + }, + { + "epoch": 0.6627632439960396, + "grad_norm": 9.760531425476074, + "learning_rate": 3.895394593339934e-05, + "loss": 2.9131, + "step": 2132000 + }, + { + "epoch": 0.6629186762765265, + "grad_norm": 5.683739185333252, + "learning_rate": 3.8951355395391224e-05, + "loss": 2.985, + "step": 2132500 + }, + { + "epoch": 0.6630741085570133, + "grad_norm": 10.837054252624512, + "learning_rate": 3.894876485738311e-05, + "loss": 2.9551, + "step": 2133000 + }, + { + "epoch": 0.6632295408375002, + "grad_norm": 9.163061141967773, + "learning_rate": 3.8946174319375e-05, + "loss": 2.9871, + "step": 2133500 + }, + { + "epoch": 0.663384973117987, + "grad_norm": 12.26524543762207, + "learning_rate": 3.8943583781366885e-05, + "loss": 2.9939, + "step": 2134000 + }, + { + "epoch": 0.6635404053984739, + "grad_norm": 10.59652042388916, + "learning_rate": 3.894099324335877e-05, + "loss": 2.9314, + "step": 2134500 + }, + { + "epoch": 0.6636958376789608, + "grad_norm": 10.550795555114746, + "learning_rate": 3.893840270535066e-05, + "loss": 2.8701, + "step": 2135000 + }, + { + "epoch": 0.6638512699594478, + "grad_norm": 7.584274768829346, + "learning_rate": 3.893581216734254e-05, + "loss": 2.9287, + "step": 2135500 + }, + { + "epoch": 0.6640067022399346, + "grad_norm": 10.02169418334961, + "learning_rate": 3.893322162933443e-05, + "loss": 2.9327, + "step": 2136000 + }, + { + "epoch": 0.6641621345204215, + "grad_norm": 9.053428649902344, + "learning_rate": 3.893063109132631e-05, + "loss": 2.9458, + "step": 2136500 + }, + { + "epoch": 0.6643175668009084, + "grad_norm": 7.962681293487549, + "learning_rate": 3.8928040553318195e-05, + "loss": 2.9309, + "step": 2137000 + }, + { + "epoch": 0.6644729990813952, + "grad_norm": 8.202290534973145, + "learning_rate": 3.892545001531008e-05, + "loss": 2.96, + "step": 2137500 + }, + { + "epoch": 0.6646284313618821, + "grad_norm": 24.518535614013672, + "learning_rate": 3.892285947730196e-05, + "loss": 2.9421, + "step": 2138000 + }, + { + "epoch": 0.664783863642369, + "grad_norm": 7.810785293579102, + "learning_rate": 3.892026893929385e-05, + "loss": 2.9499, + "step": 2138500 + }, + { + "epoch": 0.6649392959228558, + "grad_norm": 7.025989055633545, + "learning_rate": 3.891767840128574e-05, + "loss": 2.9298, + "step": 2139000 + }, + { + "epoch": 0.6650947282033427, + "grad_norm": 9.131606101989746, + "learning_rate": 3.8915087863277624e-05, + "loss": 2.9472, + "step": 2139500 + }, + { + "epoch": 0.6652501604838296, + "grad_norm": 8.280111312866211, + "learning_rate": 3.891249732526951e-05, + "loss": 2.9333, + "step": 2140000 + }, + { + "epoch": 0.6654055927643164, + "grad_norm": 10.022845268249512, + "learning_rate": 3.89099067872614e-05, + "loss": 2.9371, + "step": 2140500 + }, + { + "epoch": 0.6655610250448033, + "grad_norm": 10.670104026794434, + "learning_rate": 3.890731624925328e-05, + "loss": 2.9525, + "step": 2141000 + }, + { + "epoch": 0.6657164573252903, + "grad_norm": 8.421952247619629, + "learning_rate": 3.8904725711245165e-05, + "loss": 2.9523, + "step": 2141500 + }, + { + "epoch": 0.6658718896057771, + "grad_norm": 8.451003074645996, + "learning_rate": 3.8902135173237046e-05, + "loss": 2.9681, + "step": 2142000 + }, + { + "epoch": 0.666027321886264, + "grad_norm": 8.07363510131836, + "learning_rate": 3.889954463522893e-05, + "loss": 2.9302, + "step": 2142500 + }, + { + "epoch": 0.6661827541667509, + "grad_norm": 7.611475944519043, + "learning_rate": 3.889695409722082e-05, + "loss": 2.9738, + "step": 2143000 + }, + { + "epoch": 0.6663381864472377, + "grad_norm": 8.165590286254883, + "learning_rate": 3.889436355921271e-05, + "loss": 2.9305, + "step": 2143500 + }, + { + "epoch": 0.6664936187277246, + "grad_norm": 8.033221244812012, + "learning_rate": 3.8891773021204594e-05, + "loss": 2.9856, + "step": 2144000 + }, + { + "epoch": 0.6666490510082115, + "grad_norm": 7.805638790130615, + "learning_rate": 3.888918248319648e-05, + "loss": 2.9809, + "step": 2144500 + }, + { + "epoch": 0.6668044832886983, + "grad_norm": 7.704632759094238, + "learning_rate": 3.888659194518836e-05, + "loss": 2.9629, + "step": 2145000 + }, + { + "epoch": 0.6669599155691852, + "grad_norm": 22.280635833740234, + "learning_rate": 3.888400140718025e-05, + "loss": 2.9252, + "step": 2145500 + }, + { + "epoch": 0.6671153478496721, + "grad_norm": 7.62693452835083, + "learning_rate": 3.8881410869172136e-05, + "loss": 2.9294, + "step": 2146000 + }, + { + "epoch": 0.667270780130159, + "grad_norm": 9.292804718017578, + "learning_rate": 3.8878820331164017e-05, + "loss": 2.9609, + "step": 2146500 + }, + { + "epoch": 0.6674262124106458, + "grad_norm": 13.86976432800293, + "learning_rate": 3.8876229793155904e-05, + "loss": 2.9247, + "step": 2147000 + }, + { + "epoch": 0.6675816446911328, + "grad_norm": 8.203356742858887, + "learning_rate": 3.8873639255147784e-05, + "loss": 2.9147, + "step": 2147500 + }, + { + "epoch": 0.6677370769716197, + "grad_norm": 9.61868667602539, + "learning_rate": 3.887104871713967e-05, + "loss": 2.9377, + "step": 2148000 + }, + { + "epoch": 0.6678925092521065, + "grad_norm": 7.66645622253418, + "learning_rate": 3.886845817913156e-05, + "loss": 2.9023, + "step": 2148500 + }, + { + "epoch": 0.6680479415325934, + "grad_norm": 12.143319129943848, + "learning_rate": 3.8865867641123446e-05, + "loss": 2.9533, + "step": 2149000 + }, + { + "epoch": 0.6682033738130803, + "grad_norm": 9.470582008361816, + "learning_rate": 3.886327710311533e-05, + "loss": 3.0126, + "step": 2149500 + }, + { + "epoch": 0.6683588060935671, + "grad_norm": 8.62056827545166, + "learning_rate": 3.886068656510722e-05, + "loss": 2.9417, + "step": 2150000 + }, + { + "epoch": 0.668514238374054, + "grad_norm": 11.71785831451416, + "learning_rate": 3.88580960270991e-05, + "loss": 2.9234, + "step": 2150500 + }, + { + "epoch": 0.6686696706545409, + "grad_norm": 10.64547061920166, + "learning_rate": 3.885550548909099e-05, + "loss": 2.9207, + "step": 2151000 + }, + { + "epoch": 0.6688251029350277, + "grad_norm": 20.152568817138672, + "learning_rate": 3.8852914951082875e-05, + "loss": 2.9472, + "step": 2151500 + }, + { + "epoch": 0.6689805352155146, + "grad_norm": 7.574639320373535, + "learning_rate": 3.8850324413074755e-05, + "loss": 2.9588, + "step": 2152000 + }, + { + "epoch": 0.6691359674960015, + "grad_norm": 6.441576957702637, + "learning_rate": 3.884773387506664e-05, + "loss": 2.9133, + "step": 2152500 + }, + { + "epoch": 0.6692913997764883, + "grad_norm": 6.950446605682373, + "learning_rate": 3.884514333705853e-05, + "loss": 2.9513, + "step": 2153000 + }, + { + "epoch": 0.6694468320569753, + "grad_norm": 10.965075492858887, + "learning_rate": 3.8842552799050416e-05, + "loss": 2.9191, + "step": 2153500 + }, + { + "epoch": 0.6696022643374622, + "grad_norm": 32.91642761230469, + "learning_rate": 3.8839962261042304e-05, + "loss": 2.9582, + "step": 2154000 + }, + { + "epoch": 0.669757696617949, + "grad_norm": 7.264535427093506, + "learning_rate": 3.8837371723034184e-05, + "loss": 2.9411, + "step": 2154500 + }, + { + "epoch": 0.6699131288984359, + "grad_norm": 11.607789993286133, + "learning_rate": 3.883478118502607e-05, + "loss": 2.977, + "step": 2155000 + }, + { + "epoch": 0.6700685611789228, + "grad_norm": 6.883111953735352, + "learning_rate": 3.883219064701796e-05, + "loss": 2.9761, + "step": 2155500 + }, + { + "epoch": 0.6702239934594096, + "grad_norm": 7.814380168914795, + "learning_rate": 3.882960010900984e-05, + "loss": 2.9403, + "step": 2156000 + }, + { + "epoch": 0.6703794257398965, + "grad_norm": 6.489379405975342, + "learning_rate": 3.8827009571001726e-05, + "loss": 2.9802, + "step": 2156500 + }, + { + "epoch": 0.6705348580203834, + "grad_norm": 7.668713569641113, + "learning_rate": 3.882441903299361e-05, + "loss": 2.9672, + "step": 2157000 + }, + { + "epoch": 0.6706902903008702, + "grad_norm": 10.103001594543457, + "learning_rate": 3.882182849498549e-05, + "loss": 2.972, + "step": 2157500 + }, + { + "epoch": 0.6708457225813571, + "grad_norm": 10.08827018737793, + "learning_rate": 3.881923795697738e-05, + "loss": 2.862, + "step": 2158000 + }, + { + "epoch": 0.671001154861844, + "grad_norm": 7.833454608917236, + "learning_rate": 3.881664741896927e-05, + "loss": 2.9274, + "step": 2158500 + }, + { + "epoch": 0.6711565871423308, + "grad_norm": 7.688326835632324, + "learning_rate": 3.8814056880961155e-05, + "loss": 2.9403, + "step": 2159000 + }, + { + "epoch": 0.6713120194228178, + "grad_norm": 9.034757614135742, + "learning_rate": 3.881146634295304e-05, + "loss": 2.9557, + "step": 2159500 + }, + { + "epoch": 0.6714674517033047, + "grad_norm": 28.209957122802734, + "learning_rate": 3.880887580494492e-05, + "loss": 2.9731, + "step": 2160000 + }, + { + "epoch": 0.6716228839837916, + "grad_norm": 7.832675457000732, + "learning_rate": 3.880628526693681e-05, + "loss": 2.9883, + "step": 2160500 + }, + { + "epoch": 0.6717783162642784, + "grad_norm": 16.928390502929688, + "learning_rate": 3.8803694728928697e-05, + "loss": 2.9295, + "step": 2161000 + }, + { + "epoch": 0.6719337485447653, + "grad_norm": 6.870706081390381, + "learning_rate": 3.880110419092058e-05, + "loss": 2.9534, + "step": 2161500 + }, + { + "epoch": 0.6720891808252522, + "grad_norm": 8.432756423950195, + "learning_rate": 3.8798513652912464e-05, + "loss": 2.9836, + "step": 2162000 + }, + { + "epoch": 0.672244613105739, + "grad_norm": 23.635021209716797, + "learning_rate": 3.879592311490435e-05, + "loss": 2.9242, + "step": 2162500 + }, + { + "epoch": 0.6724000453862259, + "grad_norm": 7.889773368835449, + "learning_rate": 3.879333257689624e-05, + "loss": 2.942, + "step": 2163000 + }, + { + "epoch": 0.6725554776667128, + "grad_norm": 11.159156799316406, + "learning_rate": 3.8790742038888126e-05, + "loss": 2.9382, + "step": 2163500 + }, + { + "epoch": 0.6727109099471996, + "grad_norm": 11.330085754394531, + "learning_rate": 3.878815150088001e-05, + "loss": 2.9461, + "step": 2164000 + }, + { + "epoch": 0.6728663422276865, + "grad_norm": 8.66907787322998, + "learning_rate": 3.878556096287189e-05, + "loss": 2.9334, + "step": 2164500 + }, + { + "epoch": 0.6730217745081734, + "grad_norm": 7.263498306274414, + "learning_rate": 3.878297042486378e-05, + "loss": 2.9029, + "step": 2165000 + }, + { + "epoch": 0.6731772067886603, + "grad_norm": 7.793150901794434, + "learning_rate": 3.878037988685566e-05, + "loss": 2.9786, + "step": 2165500 + }, + { + "epoch": 0.6733326390691472, + "grad_norm": 99.58191680908203, + "learning_rate": 3.877778934884755e-05, + "loss": 2.8963, + "step": 2166000 + }, + { + "epoch": 0.6734880713496341, + "grad_norm": 9.08145523071289, + "learning_rate": 3.8775198810839435e-05, + "loss": 2.9104, + "step": 2166500 + }, + { + "epoch": 0.6736435036301209, + "grad_norm": 12.932756423950195, + "learning_rate": 3.8772608272831315e-05, + "loss": 2.9445, + "step": 2167000 + }, + { + "epoch": 0.6737989359106078, + "grad_norm": 6.8833417892456055, + "learning_rate": 3.87700177348232e-05, + "loss": 2.9341, + "step": 2167500 + }, + { + "epoch": 0.6739543681910947, + "grad_norm": 8.734052658081055, + "learning_rate": 3.876742719681509e-05, + "loss": 2.9081, + "step": 2168000 + }, + { + "epoch": 0.6741098004715815, + "grad_norm": 10.12293815612793, + "learning_rate": 3.876483665880698e-05, + "loss": 2.9267, + "step": 2168500 + }, + { + "epoch": 0.6742652327520684, + "grad_norm": 12.321205139160156, + "learning_rate": 3.8762246120798864e-05, + "loss": 2.9395, + "step": 2169000 + }, + { + "epoch": 0.6744206650325553, + "grad_norm": 27.455711364746094, + "learning_rate": 3.875965558279075e-05, + "loss": 2.921, + "step": 2169500 + }, + { + "epoch": 0.6745760973130421, + "grad_norm": 12.77207088470459, + "learning_rate": 3.875706504478263e-05, + "loss": 2.9249, + "step": 2170000 + }, + { + "epoch": 0.674731529593529, + "grad_norm": 35.619728088378906, + "learning_rate": 3.875447450677452e-05, + "loss": 2.9654, + "step": 2170500 + }, + { + "epoch": 0.6748869618740159, + "grad_norm": 10.603141784667969, + "learning_rate": 3.8751883968766406e-05, + "loss": 2.9673, + "step": 2171000 + }, + { + "epoch": 0.6750423941545028, + "grad_norm": 11.421242713928223, + "learning_rate": 3.8749293430758286e-05, + "loss": 2.9411, + "step": 2171500 + }, + { + "epoch": 0.6751978264349897, + "grad_norm": 6.775951862335205, + "learning_rate": 3.874670289275017e-05, + "loss": 2.9256, + "step": 2172000 + }, + { + "epoch": 0.6753532587154766, + "grad_norm": 8.731060028076172, + "learning_rate": 3.8744112354742054e-05, + "loss": 2.9539, + "step": 2172500 + }, + { + "epoch": 0.6755086909959634, + "grad_norm": 10.328300476074219, + "learning_rate": 3.874152181673395e-05, + "loss": 2.9508, + "step": 2173000 + }, + { + "epoch": 0.6756641232764503, + "grad_norm": 9.074698448181152, + "learning_rate": 3.8738931278725835e-05, + "loss": 2.9745, + "step": 2173500 + }, + { + "epoch": 0.6758195555569372, + "grad_norm": 11.847976684570312, + "learning_rate": 3.8736340740717715e-05, + "loss": 2.9238, + "step": 2174000 + }, + { + "epoch": 0.675974987837424, + "grad_norm": 9.332544326782227, + "learning_rate": 3.87337502027096e-05, + "loss": 2.9271, + "step": 2174500 + }, + { + "epoch": 0.6761304201179109, + "grad_norm": 28.812477111816406, + "learning_rate": 3.873115966470149e-05, + "loss": 3.0308, + "step": 2175000 + }, + { + "epoch": 0.6762858523983978, + "grad_norm": 8.408452987670898, + "learning_rate": 3.872856912669337e-05, + "loss": 2.9549, + "step": 2175500 + }, + { + "epoch": 0.6764412846788846, + "grad_norm": 10.037638664245605, + "learning_rate": 3.872597858868526e-05, + "loss": 2.9532, + "step": 2176000 + }, + { + "epoch": 0.6765967169593715, + "grad_norm": 9.774069786071777, + "learning_rate": 3.8723388050677144e-05, + "loss": 2.9479, + "step": 2176500 + }, + { + "epoch": 0.6767521492398584, + "grad_norm": 9.316040992736816, + "learning_rate": 3.8720797512669025e-05, + "loss": 2.9492, + "step": 2177000 + }, + { + "epoch": 0.6769075815203454, + "grad_norm": 8.78390884399414, + "learning_rate": 3.871820697466091e-05, + "loss": 2.96, + "step": 2177500 + }, + { + "epoch": 0.6770630138008322, + "grad_norm": 8.190272331237793, + "learning_rate": 3.87156164366528e-05, + "loss": 2.9226, + "step": 2178000 + }, + { + "epoch": 0.6772184460813191, + "grad_norm": 8.055292129516602, + "learning_rate": 3.8713025898644686e-05, + "loss": 2.938, + "step": 2178500 + }, + { + "epoch": 0.677373878361806, + "grad_norm": 7.605440616607666, + "learning_rate": 3.871043536063657e-05, + "loss": 2.9301, + "step": 2179000 + }, + { + "epoch": 0.6775293106422928, + "grad_norm": 9.18763256072998, + "learning_rate": 3.8707844822628454e-05, + "loss": 2.9674, + "step": 2179500 + }, + { + "epoch": 0.6776847429227797, + "grad_norm": 15.304665565490723, + "learning_rate": 3.870525428462034e-05, + "loss": 2.9707, + "step": 2180000 + }, + { + "epoch": 0.6778401752032666, + "grad_norm": 12.093280792236328, + "learning_rate": 3.870266374661223e-05, + "loss": 2.9772, + "step": 2180500 + }, + { + "epoch": 0.6779956074837534, + "grad_norm": 9.067160606384277, + "learning_rate": 3.870007320860411e-05, + "loss": 2.9153, + "step": 2181000 + }, + { + "epoch": 0.6781510397642403, + "grad_norm": 12.63831901550293, + "learning_rate": 3.8697482670595995e-05, + "loss": 2.9508, + "step": 2181500 + }, + { + "epoch": 0.6783064720447272, + "grad_norm": 9.244585990905762, + "learning_rate": 3.869489213258788e-05, + "loss": 2.9795, + "step": 2182000 + }, + { + "epoch": 0.678461904325214, + "grad_norm": 8.93962287902832, + "learning_rate": 3.869230159457976e-05, + "loss": 2.9469, + "step": 2182500 + }, + { + "epoch": 0.6786173366057009, + "grad_norm": 7.049583911895752, + "learning_rate": 3.868971105657166e-05, + "loss": 2.9423, + "step": 2183000 + }, + { + "epoch": 0.6787727688861879, + "grad_norm": 23.091251373291016, + "learning_rate": 3.868712051856354e-05, + "loss": 2.9382, + "step": 2183500 + }, + { + "epoch": 0.6789282011666747, + "grad_norm": 5.956478118896484, + "learning_rate": 3.8684529980555424e-05, + "loss": 2.937, + "step": 2184000 + }, + { + "epoch": 0.6790836334471616, + "grad_norm": 5.71224308013916, + "learning_rate": 3.868193944254731e-05, + "loss": 2.934, + "step": 2184500 + }, + { + "epoch": 0.6792390657276485, + "grad_norm": 8.32502555847168, + "learning_rate": 3.867934890453919e-05, + "loss": 2.9289, + "step": 2185000 + }, + { + "epoch": 0.6793944980081353, + "grad_norm": 18.295974731445312, + "learning_rate": 3.867675836653108e-05, + "loss": 2.9301, + "step": 2185500 + }, + { + "epoch": 0.6795499302886222, + "grad_norm": 6.4376630783081055, + "learning_rate": 3.8674167828522966e-05, + "loss": 2.9283, + "step": 2186000 + }, + { + "epoch": 0.6797053625691091, + "grad_norm": 12.50428295135498, + "learning_rate": 3.8671577290514847e-05, + "loss": 2.9434, + "step": 2186500 + }, + { + "epoch": 0.6798607948495959, + "grad_norm": 9.192538261413574, + "learning_rate": 3.8668986752506734e-05, + "loss": 2.9483, + "step": 2187000 + }, + { + "epoch": 0.6800162271300828, + "grad_norm": 7.8517351150512695, + "learning_rate": 3.866639621449862e-05, + "loss": 2.9454, + "step": 2187500 + }, + { + "epoch": 0.6801716594105697, + "grad_norm": 7.3408894538879395, + "learning_rate": 3.866380567649051e-05, + "loss": 2.9643, + "step": 2188000 + }, + { + "epoch": 0.6803270916910565, + "grad_norm": 8.76071548461914, + "learning_rate": 3.8661215138482395e-05, + "loss": 2.9145, + "step": 2188500 + }, + { + "epoch": 0.6804825239715434, + "grad_norm": 14.91977596282959, + "learning_rate": 3.865862460047428e-05, + "loss": 2.9684, + "step": 2189000 + }, + { + "epoch": 0.6806379562520304, + "grad_norm": 13.63682746887207, + "learning_rate": 3.865603406246616e-05, + "loss": 2.9681, + "step": 2189500 + }, + { + "epoch": 0.6807933885325173, + "grad_norm": 7.358198642730713, + "learning_rate": 3.865344352445805e-05, + "loss": 2.9263, + "step": 2190000 + }, + { + "epoch": 0.6809488208130041, + "grad_norm": 8.42491340637207, + "learning_rate": 3.865085298644993e-05, + "loss": 2.9551, + "step": 2190500 + }, + { + "epoch": 0.681104253093491, + "grad_norm": 8.595903396606445, + "learning_rate": 3.864826244844182e-05, + "loss": 2.9054, + "step": 2191000 + }, + { + "epoch": 0.6812596853739779, + "grad_norm": 7.962962627410889, + "learning_rate": 3.8645671910433705e-05, + "loss": 2.9445, + "step": 2191500 + }, + { + "epoch": 0.6814151176544647, + "grad_norm": 25.0757999420166, + "learning_rate": 3.8643081372425585e-05, + "loss": 2.9617, + "step": 2192000 + }, + { + "epoch": 0.6815705499349516, + "grad_norm": 8.613195419311523, + "learning_rate": 3.864049083441747e-05, + "loss": 2.9126, + "step": 2192500 + }, + { + "epoch": 0.6817259822154385, + "grad_norm": 8.356305122375488, + "learning_rate": 3.8637900296409366e-05, + "loss": 2.9145, + "step": 2193000 + }, + { + "epoch": 0.6818814144959253, + "grad_norm": 6.774901866912842, + "learning_rate": 3.8635309758401246e-05, + "loss": 2.92, + "step": 2193500 + }, + { + "epoch": 0.6820368467764122, + "grad_norm": 8.29477310180664, + "learning_rate": 3.8632719220393134e-05, + "loss": 2.9245, + "step": 2194000 + }, + { + "epoch": 0.682192279056899, + "grad_norm": 8.625246047973633, + "learning_rate": 3.863012868238502e-05, + "loss": 2.9188, + "step": 2194500 + }, + { + "epoch": 0.6823477113373859, + "grad_norm": 10.325197219848633, + "learning_rate": 3.86275381443769e-05, + "loss": 2.9102, + "step": 2195000 + }, + { + "epoch": 0.6825031436178729, + "grad_norm": 8.557430267333984, + "learning_rate": 3.862494760636879e-05, + "loss": 2.9689, + "step": 2195500 + }, + { + "epoch": 0.6826585758983598, + "grad_norm": 9.428094863891602, + "learning_rate": 3.862235706836067e-05, + "loss": 2.9322, + "step": 2196000 + }, + { + "epoch": 0.6828140081788466, + "grad_norm": 8.250252723693848, + "learning_rate": 3.8619766530352556e-05, + "loss": 2.9373, + "step": 2196500 + }, + { + "epoch": 0.6829694404593335, + "grad_norm": 8.137353897094727, + "learning_rate": 3.861717599234444e-05, + "loss": 2.9364, + "step": 2197000 + }, + { + "epoch": 0.6831248727398204, + "grad_norm": 15.704087257385254, + "learning_rate": 3.861458545433633e-05, + "loss": 2.9286, + "step": 2197500 + }, + { + "epoch": 0.6832803050203072, + "grad_norm": 14.821760177612305, + "learning_rate": 3.861199491632822e-05, + "loss": 2.9913, + "step": 2198000 + }, + { + "epoch": 0.6834357373007941, + "grad_norm": 9.54485034942627, + "learning_rate": 3.8609404378320104e-05, + "loss": 2.9455, + "step": 2198500 + }, + { + "epoch": 0.683591169581281, + "grad_norm": 6.672616958618164, + "learning_rate": 3.8606813840311985e-05, + "loss": 2.9473, + "step": 2199000 + }, + { + "epoch": 0.6837466018617678, + "grad_norm": 7.537691593170166, + "learning_rate": 3.860422330230387e-05, + "loss": 2.9549, + "step": 2199500 + }, + { + "epoch": 0.6839020341422547, + "grad_norm": 6.781294345855713, + "learning_rate": 3.860163276429576e-05, + "loss": 2.9298, + "step": 2200000 + }, + { + "epoch": 0.6840574664227416, + "grad_norm": 10.719715118408203, + "learning_rate": 3.859904222628764e-05, + "loss": 2.9737, + "step": 2200500 + }, + { + "epoch": 0.6842128987032284, + "grad_norm": 7.14894437789917, + "learning_rate": 3.8596451688279527e-05, + "loss": 2.9463, + "step": 2201000 + }, + { + "epoch": 0.6843683309837154, + "grad_norm": 9.284631729125977, + "learning_rate": 3.859386115027141e-05, + "loss": 2.9756, + "step": 2201500 + }, + { + "epoch": 0.6845237632642023, + "grad_norm": 7.845776557922363, + "learning_rate": 3.8591270612263294e-05, + "loss": 2.9496, + "step": 2202000 + }, + { + "epoch": 0.6846791955446891, + "grad_norm": 8.753191947937012, + "learning_rate": 3.858868007425518e-05, + "loss": 2.9613, + "step": 2202500 + }, + { + "epoch": 0.684834627825176, + "grad_norm": 11.113133430480957, + "learning_rate": 3.858608953624707e-05, + "loss": 2.926, + "step": 2203000 + }, + { + "epoch": 0.6849900601056629, + "grad_norm": 7.182265281677246, + "learning_rate": 3.8583498998238956e-05, + "loss": 2.9986, + "step": 2203500 + }, + { + "epoch": 0.6851454923861497, + "grad_norm": 7.359230041503906, + "learning_rate": 3.858090846023084e-05, + "loss": 2.9042, + "step": 2204000 + }, + { + "epoch": 0.6853009246666366, + "grad_norm": 10.038125038146973, + "learning_rate": 3.857831792222272e-05, + "loss": 2.9309, + "step": 2204500 + }, + { + "epoch": 0.6854563569471235, + "grad_norm": 14.837204933166504, + "learning_rate": 3.857572738421461e-05, + "loss": 2.9024, + "step": 2205000 + }, + { + "epoch": 0.6856117892276103, + "grad_norm": 21.008352279663086, + "learning_rate": 3.85731368462065e-05, + "loss": 2.9296, + "step": 2205500 + }, + { + "epoch": 0.6857672215080972, + "grad_norm": 8.59697437286377, + "learning_rate": 3.857054630819838e-05, + "loss": 2.9386, + "step": 2206000 + }, + { + "epoch": 0.6859226537885841, + "grad_norm": 20.99091911315918, + "learning_rate": 3.8567955770190265e-05, + "loss": 2.9939, + "step": 2206500 + }, + { + "epoch": 0.686078086069071, + "grad_norm": 9.863831520080566, + "learning_rate": 3.856536523218215e-05, + "loss": 2.9729, + "step": 2207000 + }, + { + "epoch": 0.6862335183495579, + "grad_norm": 7.829987525939941, + "learning_rate": 3.856277469417404e-05, + "loss": 2.9618, + "step": 2207500 + }, + { + "epoch": 0.6863889506300448, + "grad_norm": 14.178709030151367, + "learning_rate": 3.8560184156165926e-05, + "loss": 2.9624, + "step": 2208000 + }, + { + "epoch": 0.6865443829105317, + "grad_norm": 7.239316463470459, + "learning_rate": 3.855759361815781e-05, + "loss": 2.9195, + "step": 2208500 + }, + { + "epoch": 0.6866998151910185, + "grad_norm": 7.736423492431641, + "learning_rate": 3.8555003080149694e-05, + "loss": 2.9537, + "step": 2209000 + }, + { + "epoch": 0.6868552474715054, + "grad_norm": 8.740656852722168, + "learning_rate": 3.855241254214158e-05, + "loss": 2.9222, + "step": 2209500 + }, + { + "epoch": 0.6870106797519923, + "grad_norm": 8.40234661102295, + "learning_rate": 3.854982200413346e-05, + "loss": 2.9354, + "step": 2210000 + }, + { + "epoch": 0.6871661120324791, + "grad_norm": 9.628589630126953, + "learning_rate": 3.854723146612535e-05, + "loss": 2.9304, + "step": 2210500 + }, + { + "epoch": 0.687321544312966, + "grad_norm": 7.853554725646973, + "learning_rate": 3.8544640928117236e-05, + "loss": 2.946, + "step": 2211000 + }, + { + "epoch": 0.6874769765934529, + "grad_norm": 7.1966352462768555, + "learning_rate": 3.8542050390109116e-05, + "loss": 2.9349, + "step": 2211500 + }, + { + "epoch": 0.6876324088739397, + "grad_norm": 8.931255340576172, + "learning_rate": 3.8539459852101e-05, + "loss": 2.9615, + "step": 2212000 + }, + { + "epoch": 0.6877878411544266, + "grad_norm": 10.165812492370605, + "learning_rate": 3.853686931409289e-05, + "loss": 2.9444, + "step": 2212500 + }, + { + "epoch": 0.6879432734349135, + "grad_norm": 8.755301475524902, + "learning_rate": 3.853427877608478e-05, + "loss": 2.9221, + "step": 2213000 + }, + { + "epoch": 0.6880987057154004, + "grad_norm": 12.633366584777832, + "learning_rate": 3.8531688238076665e-05, + "loss": 2.9317, + "step": 2213500 + }, + { + "epoch": 0.6882541379958873, + "grad_norm": 9.421823501586914, + "learning_rate": 3.8529097700068545e-05, + "loss": 2.9664, + "step": 2214000 + }, + { + "epoch": 0.6884095702763742, + "grad_norm": 8.613665580749512, + "learning_rate": 3.852650716206043e-05, + "loss": 2.941, + "step": 2214500 + }, + { + "epoch": 0.688565002556861, + "grad_norm": 7.491003036499023, + "learning_rate": 3.852391662405232e-05, + "loss": 2.9322, + "step": 2215000 + }, + { + "epoch": 0.6887204348373479, + "grad_norm": 10.745080947875977, + "learning_rate": 3.85213260860442e-05, + "loss": 2.9611, + "step": 2215500 + }, + { + "epoch": 0.6888758671178348, + "grad_norm": 8.669526100158691, + "learning_rate": 3.851873554803609e-05, + "loss": 2.9118, + "step": 2216000 + }, + { + "epoch": 0.6890312993983216, + "grad_norm": 12.498026847839355, + "learning_rate": 3.8516145010027974e-05, + "loss": 2.9772, + "step": 2216500 + }, + { + "epoch": 0.6891867316788085, + "grad_norm": 18.17035484313965, + "learning_rate": 3.851355447201986e-05, + "loss": 2.9477, + "step": 2217000 + }, + { + "epoch": 0.6893421639592954, + "grad_norm": 8.781759262084961, + "learning_rate": 3.851096393401175e-05, + "loss": 2.9239, + "step": 2217500 + }, + { + "epoch": 0.6894975962397822, + "grad_norm": 7.818151950836182, + "learning_rate": 3.8508373396003636e-05, + "loss": 2.9247, + "step": 2218000 + }, + { + "epoch": 0.6896530285202691, + "grad_norm": 8.939647674560547, + "learning_rate": 3.8505782857995516e-05, + "loss": 2.9318, + "step": 2218500 + }, + { + "epoch": 0.689808460800756, + "grad_norm": 9.065452575683594, + "learning_rate": 3.85031923199874e-05, + "loss": 2.9487, + "step": 2219000 + }, + { + "epoch": 0.689963893081243, + "grad_norm": 6.709332466125488, + "learning_rate": 3.8500601781979283e-05, + "loss": 2.9616, + "step": 2219500 + }, + { + "epoch": 0.6901193253617298, + "grad_norm": 8.900562286376953, + "learning_rate": 3.849801124397117e-05, + "loss": 2.9492, + "step": 2220000 + }, + { + "epoch": 0.6902747576422167, + "grad_norm": 12.74808120727539, + "learning_rate": 3.849542070596306e-05, + "loss": 2.8891, + "step": 2220500 + }, + { + "epoch": 0.6904301899227036, + "grad_norm": 7.778544902801514, + "learning_rate": 3.849283016795494e-05, + "loss": 2.9569, + "step": 2221000 + }, + { + "epoch": 0.6905856222031904, + "grad_norm": 11.39822769165039, + "learning_rate": 3.8490239629946825e-05, + "loss": 2.9405, + "step": 2221500 + }, + { + "epoch": 0.6907410544836773, + "grad_norm": 8.317307472229004, + "learning_rate": 3.848764909193871e-05, + "loss": 3.0138, + "step": 2222000 + }, + { + "epoch": 0.6908964867641642, + "grad_norm": 9.61764907836914, + "learning_rate": 3.84850585539306e-05, + "loss": 2.9699, + "step": 2222500 + }, + { + "epoch": 0.691051919044651, + "grad_norm": 15.598971366882324, + "learning_rate": 3.848246801592249e-05, + "loss": 2.959, + "step": 2223000 + }, + { + "epoch": 0.6912073513251379, + "grad_norm": 10.159846305847168, + "learning_rate": 3.8479877477914374e-05, + "loss": 2.9255, + "step": 2223500 + }, + { + "epoch": 0.6913627836056248, + "grad_norm": 13.906012535095215, + "learning_rate": 3.8477286939906254e-05, + "loss": 2.9493, + "step": 2224000 + }, + { + "epoch": 0.6915182158861116, + "grad_norm": 7.8218092918396, + "learning_rate": 3.847469640189814e-05, + "loss": 2.9536, + "step": 2224500 + }, + { + "epoch": 0.6916736481665985, + "grad_norm": 7.019822120666504, + "learning_rate": 3.847210586389003e-05, + "loss": 2.9477, + "step": 2225000 + }, + { + "epoch": 0.6918290804470855, + "grad_norm": 10.430644035339355, + "learning_rate": 3.846951532588191e-05, + "loss": 2.9658, + "step": 2225500 + }, + { + "epoch": 0.6919845127275723, + "grad_norm": 6.082206726074219, + "learning_rate": 3.8466924787873796e-05, + "loss": 2.9278, + "step": 2226000 + }, + { + "epoch": 0.6921399450080592, + "grad_norm": 12.422155380249023, + "learning_rate": 3.846433424986568e-05, + "loss": 2.9534, + "step": 2226500 + }, + { + "epoch": 0.6922953772885461, + "grad_norm": 7.836662769317627, + "learning_rate": 3.846174371185757e-05, + "loss": 2.95, + "step": 2227000 + }, + { + "epoch": 0.6924508095690329, + "grad_norm": 8.278244018554688, + "learning_rate": 3.845915317384946e-05, + "loss": 2.9372, + "step": 2227500 + }, + { + "epoch": 0.6926062418495198, + "grad_norm": 10.513011932373047, + "learning_rate": 3.845656263584134e-05, + "loss": 2.9711, + "step": 2228000 + }, + { + "epoch": 0.6927616741300067, + "grad_norm": 14.961408615112305, + "learning_rate": 3.8453972097833225e-05, + "loss": 2.9938, + "step": 2228500 + }, + { + "epoch": 0.6929171064104935, + "grad_norm": 16.26396942138672, + "learning_rate": 3.845138155982511e-05, + "loss": 2.915, + "step": 2229000 + }, + { + "epoch": 0.6930725386909804, + "grad_norm": 7.070163726806641, + "learning_rate": 3.844879102181699e-05, + "loss": 2.9415, + "step": 2229500 + }, + { + "epoch": 0.6932279709714673, + "grad_norm": 13.802383422851562, + "learning_rate": 3.844620048380888e-05, + "loss": 2.9396, + "step": 2230000 + }, + { + "epoch": 0.6933834032519541, + "grad_norm": 8.666500091552734, + "learning_rate": 3.844360994580077e-05, + "loss": 2.9105, + "step": 2230500 + }, + { + "epoch": 0.693538835532441, + "grad_norm": 9.01004695892334, + "learning_rate": 3.844101940779265e-05, + "loss": 2.9447, + "step": 2231000 + }, + { + "epoch": 0.693694267812928, + "grad_norm": 10.274222373962402, + "learning_rate": 3.8438428869784535e-05, + "loss": 2.9663, + "step": 2231500 + }, + { + "epoch": 0.6938497000934148, + "grad_norm": 8.077537536621094, + "learning_rate": 3.843583833177642e-05, + "loss": 2.9066, + "step": 2232000 + }, + { + "epoch": 0.6940051323739017, + "grad_norm": 9.148347854614258, + "learning_rate": 3.843324779376831e-05, + "loss": 2.9476, + "step": 2232500 + }, + { + "epoch": 0.6941605646543886, + "grad_norm": 7.955427169799805, + "learning_rate": 3.8430657255760196e-05, + "loss": 2.9304, + "step": 2233000 + }, + { + "epoch": 0.6943159969348754, + "grad_norm": 12.526962280273438, + "learning_rate": 3.8428066717752076e-05, + "loss": 2.9561, + "step": 2233500 + }, + { + "epoch": 0.6944714292153623, + "grad_norm": 8.042696952819824, + "learning_rate": 3.8425476179743963e-05, + "loss": 2.9004, + "step": 2234000 + }, + { + "epoch": 0.6946268614958492, + "grad_norm": 11.400052070617676, + "learning_rate": 3.842288564173585e-05, + "loss": 2.926, + "step": 2234500 + }, + { + "epoch": 0.694782293776336, + "grad_norm": 7.678921222686768, + "learning_rate": 3.842029510372773e-05, + "loss": 2.9409, + "step": 2235000 + }, + { + "epoch": 0.6949377260568229, + "grad_norm": 9.183511734008789, + "learning_rate": 3.841770456571962e-05, + "loss": 2.9319, + "step": 2235500 + }, + { + "epoch": 0.6950931583373098, + "grad_norm": 8.83132553100586, + "learning_rate": 3.8415114027711505e-05, + "loss": 2.966, + "step": 2236000 + }, + { + "epoch": 0.6952485906177966, + "grad_norm": 7.313543319702148, + "learning_rate": 3.841252348970339e-05, + "loss": 2.9098, + "step": 2236500 + }, + { + "epoch": 0.6954040228982835, + "grad_norm": 9.041053771972656, + "learning_rate": 3.840993295169528e-05, + "loss": 2.9036, + "step": 2237000 + }, + { + "epoch": 0.6955594551787705, + "grad_norm": 9.42893123626709, + "learning_rate": 3.840734241368716e-05, + "loss": 2.9553, + "step": 2237500 + }, + { + "epoch": 0.6957148874592574, + "grad_norm": 7.482171058654785, + "learning_rate": 3.840475187567905e-05, + "loss": 2.9242, + "step": 2238000 + }, + { + "epoch": 0.6958703197397442, + "grad_norm": 8.003581047058105, + "learning_rate": 3.8402161337670934e-05, + "loss": 2.9551, + "step": 2238500 + }, + { + "epoch": 0.6960257520202311, + "grad_norm": 8.797994613647461, + "learning_rate": 3.8399570799662815e-05, + "loss": 2.923, + "step": 2239000 + }, + { + "epoch": 0.696181184300718, + "grad_norm": 7.4544782638549805, + "learning_rate": 3.83969802616547e-05, + "loss": 2.9435, + "step": 2239500 + }, + { + "epoch": 0.6963366165812048, + "grad_norm": 8.894843101501465, + "learning_rate": 3.839438972364659e-05, + "loss": 2.9483, + "step": 2240000 + }, + { + "epoch": 0.6964920488616917, + "grad_norm": 7.919556140899658, + "learning_rate": 3.839179918563847e-05, + "loss": 2.9718, + "step": 2240500 + }, + { + "epoch": 0.6966474811421786, + "grad_norm": 10.146053314208984, + "learning_rate": 3.8389208647630357e-05, + "loss": 2.9566, + "step": 2241000 + }, + { + "epoch": 0.6968029134226654, + "grad_norm": 8.382363319396973, + "learning_rate": 3.8386618109622244e-05, + "loss": 2.9263, + "step": 2241500 + }, + { + "epoch": 0.6969583457031523, + "grad_norm": 9.260991096496582, + "learning_rate": 3.838402757161413e-05, + "loss": 3.0133, + "step": 2242000 + }, + { + "epoch": 0.6971137779836392, + "grad_norm": 8.682186126708984, + "learning_rate": 3.838143703360602e-05, + "loss": 2.9749, + "step": 2242500 + }, + { + "epoch": 0.697269210264126, + "grad_norm": 9.578612327575684, + "learning_rate": 3.8378846495597905e-05, + "loss": 2.9028, + "step": 2243000 + }, + { + "epoch": 0.697424642544613, + "grad_norm": 6.927248954772949, + "learning_rate": 3.8376255957589786e-05, + "loss": 2.9347, + "step": 2243500 + }, + { + "epoch": 0.6975800748250999, + "grad_norm": 8.322017669677734, + "learning_rate": 3.837366541958167e-05, + "loss": 2.9134, + "step": 2244000 + }, + { + "epoch": 0.6977355071055867, + "grad_norm": 7.815359592437744, + "learning_rate": 3.837107488157355e-05, + "loss": 2.9632, + "step": 2244500 + }, + { + "epoch": 0.6978909393860736, + "grad_norm": 8.538209915161133, + "learning_rate": 3.836848434356544e-05, + "loss": 2.8745, + "step": 2245000 + }, + { + "epoch": 0.6980463716665605, + "grad_norm": 7.953689098358154, + "learning_rate": 3.836589380555733e-05, + "loss": 2.9722, + "step": 2245500 + }, + { + "epoch": 0.6982018039470473, + "grad_norm": 9.639273643493652, + "learning_rate": 3.836330326754921e-05, + "loss": 2.933, + "step": 2246000 + }, + { + "epoch": 0.6983572362275342, + "grad_norm": 8.373224258422852, + "learning_rate": 3.83607127295411e-05, + "loss": 2.9689, + "step": 2246500 + }, + { + "epoch": 0.6985126685080211, + "grad_norm": 9.59785270690918, + "learning_rate": 3.835812219153299e-05, + "loss": 2.9586, + "step": 2247000 + }, + { + "epoch": 0.6986681007885079, + "grad_norm": 13.101896286010742, + "learning_rate": 3.835553165352487e-05, + "loss": 2.9221, + "step": 2247500 + }, + { + "epoch": 0.6988235330689948, + "grad_norm": 7.817811965942383, + "learning_rate": 3.8352941115516756e-05, + "loss": 2.9645, + "step": 2248000 + }, + { + "epoch": 0.6989789653494817, + "grad_norm": 8.826669692993164, + "learning_rate": 3.8350350577508644e-05, + "loss": 2.9433, + "step": 2248500 + }, + { + "epoch": 0.6991343976299685, + "grad_norm": 22.732566833496094, + "learning_rate": 3.8347760039500524e-05, + "loss": 2.9221, + "step": 2249000 + }, + { + "epoch": 0.6992898299104555, + "grad_norm": 12.16872501373291, + "learning_rate": 3.834516950149241e-05, + "loss": 2.893, + "step": 2249500 + }, + { + "epoch": 0.6994452621909424, + "grad_norm": 6.701883316040039, + "learning_rate": 3.834257896348429e-05, + "loss": 3.0142, + "step": 2250000 + }, + { + "epoch": 0.6996006944714293, + "grad_norm": 9.109418869018555, + "learning_rate": 3.833998842547618e-05, + "loss": 2.9257, + "step": 2250500 + }, + { + "epoch": 0.6997561267519161, + "grad_norm": 8.87031078338623, + "learning_rate": 3.8337397887468066e-05, + "loss": 2.8963, + "step": 2251000 + }, + { + "epoch": 0.699911559032403, + "grad_norm": 8.53616714477539, + "learning_rate": 3.833480734945995e-05, + "loss": 2.9452, + "step": 2251500 + }, + { + "epoch": 0.7000669913128899, + "grad_norm": 7.585029125213623, + "learning_rate": 3.833221681145184e-05, + "loss": 2.9368, + "step": 2252000 + }, + { + "epoch": 0.7002224235933767, + "grad_norm": 8.78255844116211, + "learning_rate": 3.832962627344373e-05, + "loss": 2.9476, + "step": 2252500 + }, + { + "epoch": 0.7003778558738636, + "grad_norm": 8.951521873474121, + "learning_rate": 3.832703573543561e-05, + "loss": 2.8883, + "step": 2253000 + }, + { + "epoch": 0.7005332881543505, + "grad_norm": 9.293209075927734, + "learning_rate": 3.8324445197427495e-05, + "loss": 2.9301, + "step": 2253500 + }, + { + "epoch": 0.7006887204348373, + "grad_norm": 8.931901931762695, + "learning_rate": 3.832185465941938e-05, + "loss": 2.9346, + "step": 2254000 + }, + { + "epoch": 0.7008441527153242, + "grad_norm": 8.488280296325684, + "learning_rate": 3.831926412141126e-05, + "loss": 2.9543, + "step": 2254500 + }, + { + "epoch": 0.700999584995811, + "grad_norm": 8.309814453125, + "learning_rate": 3.831667358340315e-05, + "loss": 2.9707, + "step": 2255000 + }, + { + "epoch": 0.701155017276298, + "grad_norm": 11.566980361938477, + "learning_rate": 3.831408304539503e-05, + "loss": 2.9869, + "step": 2255500 + }, + { + "epoch": 0.7013104495567849, + "grad_norm": 8.199007987976074, + "learning_rate": 3.831149250738692e-05, + "loss": 2.9562, + "step": 2256000 + }, + { + "epoch": 0.7014658818372718, + "grad_norm": 7.942444801330566, + "learning_rate": 3.830890196937881e-05, + "loss": 2.9278, + "step": 2256500 + }, + { + "epoch": 0.7016213141177586, + "grad_norm": 9.93589973449707, + "learning_rate": 3.830631143137069e-05, + "loss": 2.9246, + "step": 2257000 + }, + { + "epoch": 0.7017767463982455, + "grad_norm": 9.215386390686035, + "learning_rate": 3.830372089336258e-05, + "loss": 2.9621, + "step": 2257500 + }, + { + "epoch": 0.7019321786787324, + "grad_norm": 9.64030647277832, + "learning_rate": 3.8301130355354466e-05, + "loss": 2.9609, + "step": 2258000 + }, + { + "epoch": 0.7020876109592192, + "grad_norm": 10.043825149536133, + "learning_rate": 3.8298539817346346e-05, + "loss": 2.9765, + "step": 2258500 + }, + { + "epoch": 0.7022430432397061, + "grad_norm": 9.813098907470703, + "learning_rate": 3.829594927933823e-05, + "loss": 2.9187, + "step": 2259000 + }, + { + "epoch": 0.702398475520193, + "grad_norm": 6.630639553070068, + "learning_rate": 3.829335874133012e-05, + "loss": 2.9594, + "step": 2259500 + }, + { + "epoch": 0.7025539078006798, + "grad_norm": 8.692965507507324, + "learning_rate": 3.8290768203322e-05, + "loss": 2.9219, + "step": 2260000 + }, + { + "epoch": 0.7027093400811667, + "grad_norm": 7.49493932723999, + "learning_rate": 3.828817766531389e-05, + "loss": 2.8686, + "step": 2260500 + }, + { + "epoch": 0.7028647723616536, + "grad_norm": 9.509352684020996, + "learning_rate": 3.8285587127305775e-05, + "loss": 2.9486, + "step": 2261000 + }, + { + "epoch": 0.7030202046421404, + "grad_norm": 7.748218059539795, + "learning_rate": 3.828299658929766e-05, + "loss": 2.8425, + "step": 2261500 + }, + { + "epoch": 0.7031756369226274, + "grad_norm": 6.136786460876465, + "learning_rate": 3.828040605128955e-05, + "loss": 2.8816, + "step": 2262000 + }, + { + "epoch": 0.7033310692031143, + "grad_norm": 11.084354400634766, + "learning_rate": 3.827781551328143e-05, + "loss": 2.9194, + "step": 2262500 + }, + { + "epoch": 0.7034865014836011, + "grad_norm": 9.339425086975098, + "learning_rate": 3.827522497527332e-05, + "loss": 2.9241, + "step": 2263000 + }, + { + "epoch": 0.703641933764088, + "grad_norm": 8.66921615600586, + "learning_rate": 3.8272634437265204e-05, + "loss": 2.9472, + "step": 2263500 + }, + { + "epoch": 0.7037973660445749, + "grad_norm": 8.872159957885742, + "learning_rate": 3.8270043899257084e-05, + "loss": 2.955, + "step": 2264000 + }, + { + "epoch": 0.7039527983250617, + "grad_norm": 8.246291160583496, + "learning_rate": 3.826745336124897e-05, + "loss": 2.9186, + "step": 2264500 + }, + { + "epoch": 0.7041082306055486, + "grad_norm": 11.288674354553223, + "learning_rate": 3.826486282324086e-05, + "loss": 2.9484, + "step": 2265000 + }, + { + "epoch": 0.7042636628860355, + "grad_norm": 9.514851570129395, + "learning_rate": 3.826227228523274e-05, + "loss": 2.9452, + "step": 2265500 + }, + { + "epoch": 0.7044190951665223, + "grad_norm": 6.838502883911133, + "learning_rate": 3.8259681747224626e-05, + "loss": 2.954, + "step": 2266000 + }, + { + "epoch": 0.7045745274470092, + "grad_norm": 8.56899642944336, + "learning_rate": 3.825709120921652e-05, + "loss": 2.9427, + "step": 2266500 + }, + { + "epoch": 0.7047299597274961, + "grad_norm": 7.354959011077881, + "learning_rate": 3.82545006712084e-05, + "loss": 2.9474, + "step": 2267000 + }, + { + "epoch": 0.704885392007983, + "grad_norm": 8.814720153808594, + "learning_rate": 3.825191013320029e-05, + "loss": 2.9459, + "step": 2267500 + }, + { + "epoch": 0.7050408242884699, + "grad_norm": 6.55785608291626, + "learning_rate": 3.824931959519217e-05, + "loss": 2.91, + "step": 2268000 + }, + { + "epoch": 0.7051962565689568, + "grad_norm": 27.636178970336914, + "learning_rate": 3.8246729057184055e-05, + "loss": 2.9311, + "step": 2268500 + }, + { + "epoch": 0.7053516888494437, + "grad_norm": 9.630851745605469, + "learning_rate": 3.824413851917594e-05, + "loss": 2.9516, + "step": 2269000 + }, + { + "epoch": 0.7055071211299305, + "grad_norm": 8.93175983428955, + "learning_rate": 3.824154798116782e-05, + "loss": 2.8942, + "step": 2269500 + }, + { + "epoch": 0.7056625534104174, + "grad_norm": 8.146409034729004, + "learning_rate": 3.823895744315971e-05, + "loss": 2.9572, + "step": 2270000 + }, + { + "epoch": 0.7058179856909043, + "grad_norm": 10.440580368041992, + "learning_rate": 3.82363669051516e-05, + "loss": 2.8869, + "step": 2270500 + }, + { + "epoch": 0.7059734179713911, + "grad_norm": 9.721216201782227, + "learning_rate": 3.8233776367143484e-05, + "loss": 2.9034, + "step": 2271000 + }, + { + "epoch": 0.706128850251878, + "grad_norm": 6.658566474914551, + "learning_rate": 3.823118582913537e-05, + "loss": 2.9651, + "step": 2271500 + }, + { + "epoch": 0.7062842825323649, + "grad_norm": 9.809502601623535, + "learning_rate": 3.822859529112726e-05, + "loss": 2.9419, + "step": 2272000 + }, + { + "epoch": 0.7064397148128517, + "grad_norm": 6.95580530166626, + "learning_rate": 3.822600475311914e-05, + "loss": 2.9654, + "step": 2272500 + }, + { + "epoch": 0.7065951470933386, + "grad_norm": 8.298676490783691, + "learning_rate": 3.8223414215111026e-05, + "loss": 2.9079, + "step": 2273000 + }, + { + "epoch": 0.7067505793738255, + "grad_norm": 6.739914894104004, + "learning_rate": 3.822082367710291e-05, + "loss": 2.8974, + "step": 2273500 + }, + { + "epoch": 0.7069060116543124, + "grad_norm": 9.189603805541992, + "learning_rate": 3.8218233139094793e-05, + "loss": 2.9343, + "step": 2274000 + }, + { + "epoch": 0.7070614439347993, + "grad_norm": 8.835741996765137, + "learning_rate": 3.821564260108668e-05, + "loss": 2.8967, + "step": 2274500 + }, + { + "epoch": 0.7072168762152862, + "grad_norm": 9.940936088562012, + "learning_rate": 3.821305206307856e-05, + "loss": 2.9578, + "step": 2275000 + }, + { + "epoch": 0.707372308495773, + "grad_norm": 8.497230529785156, + "learning_rate": 3.821046152507045e-05, + "loss": 2.9123, + "step": 2275500 + }, + { + "epoch": 0.7075277407762599, + "grad_norm": 8.802855491638184, + "learning_rate": 3.8207870987062335e-05, + "loss": 2.9387, + "step": 2276000 + }, + { + "epoch": 0.7076831730567468, + "grad_norm": 7.368080139160156, + "learning_rate": 3.820528044905422e-05, + "loss": 2.9692, + "step": 2276500 + }, + { + "epoch": 0.7078386053372336, + "grad_norm": 7.226041316986084, + "learning_rate": 3.820268991104611e-05, + "loss": 2.9261, + "step": 2277000 + }, + { + "epoch": 0.7079940376177205, + "grad_norm": 7.931822776794434, + "learning_rate": 3.8200099373038e-05, + "loss": 2.9628, + "step": 2277500 + }, + { + "epoch": 0.7081494698982074, + "grad_norm": 8.834148406982422, + "learning_rate": 3.819750883502988e-05, + "loss": 2.9303, + "step": 2278000 + }, + { + "epoch": 0.7083049021786942, + "grad_norm": 7.913825511932373, + "learning_rate": 3.8194918297021764e-05, + "loss": 2.9315, + "step": 2278500 + }, + { + "epoch": 0.7084603344591811, + "grad_norm": 9.984018325805664, + "learning_rate": 3.819232775901365e-05, + "loss": 2.9818, + "step": 2279000 + }, + { + "epoch": 0.708615766739668, + "grad_norm": 7.799585819244385, + "learning_rate": 3.818973722100553e-05, + "loss": 2.9682, + "step": 2279500 + }, + { + "epoch": 0.708771199020155, + "grad_norm": 6.242628574371338, + "learning_rate": 3.818714668299742e-05, + "loss": 2.9331, + "step": 2280000 + }, + { + "epoch": 0.7089266313006418, + "grad_norm": 5.806081295013428, + "learning_rate": 3.8184556144989306e-05, + "loss": 2.9448, + "step": 2280500 + }, + { + "epoch": 0.7090820635811287, + "grad_norm": 23.7719669342041, + "learning_rate": 3.818196560698119e-05, + "loss": 2.94, + "step": 2281000 + }, + { + "epoch": 0.7092374958616156, + "grad_norm": 10.548627853393555, + "learning_rate": 3.817937506897308e-05, + "loss": 2.8814, + "step": 2281500 + }, + { + "epoch": 0.7093929281421024, + "grad_norm": 8.226519584655762, + "learning_rate": 3.817678453096496e-05, + "loss": 2.9334, + "step": 2282000 + }, + { + "epoch": 0.7095483604225893, + "grad_norm": 6.330270767211914, + "learning_rate": 3.817419399295685e-05, + "loss": 2.9387, + "step": 2282500 + }, + { + "epoch": 0.7097037927030762, + "grad_norm": 7.0007004737854, + "learning_rate": 3.8171603454948735e-05, + "loss": 2.913, + "step": 2283000 + }, + { + "epoch": 0.709859224983563, + "grad_norm": 8.96531867980957, + "learning_rate": 3.8169012916940615e-05, + "loss": 2.8883, + "step": 2283500 + }, + { + "epoch": 0.7100146572640499, + "grad_norm": 8.625533103942871, + "learning_rate": 3.81664223789325e-05, + "loss": 2.9156, + "step": 2284000 + }, + { + "epoch": 0.7101700895445368, + "grad_norm": 6.224393367767334, + "learning_rate": 3.816383184092439e-05, + "loss": 2.9036, + "step": 2284500 + }, + { + "epoch": 0.7103255218250236, + "grad_norm": 9.394339561462402, + "learning_rate": 3.816124130291627e-05, + "loss": 2.9149, + "step": 2285000 + }, + { + "epoch": 0.7104809541055105, + "grad_norm": 49.11686325073242, + "learning_rate": 3.815865076490816e-05, + "loss": 2.9521, + "step": 2285500 + }, + { + "epoch": 0.7106363863859975, + "grad_norm": 6.320302963256836, + "learning_rate": 3.8156060226900044e-05, + "loss": 2.9494, + "step": 2286000 + }, + { + "epoch": 0.7107918186664843, + "grad_norm": 8.04515552520752, + "learning_rate": 3.815346968889193e-05, + "loss": 2.8751, + "step": 2286500 + }, + { + "epoch": 0.7109472509469712, + "grad_norm": 7.4657111167907715, + "learning_rate": 3.815087915088382e-05, + "loss": 2.8948, + "step": 2287000 + }, + { + "epoch": 0.7111026832274581, + "grad_norm": 8.240296363830566, + "learning_rate": 3.81482886128757e-05, + "loss": 2.9339, + "step": 2287500 + }, + { + "epoch": 0.7112581155079449, + "grad_norm": 9.750374794006348, + "learning_rate": 3.8145698074867586e-05, + "loss": 2.9369, + "step": 2288000 + }, + { + "epoch": 0.7114135477884318, + "grad_norm": 8.855311393737793, + "learning_rate": 3.8143107536859473e-05, + "loss": 2.931, + "step": 2288500 + }, + { + "epoch": 0.7115689800689187, + "grad_norm": 7.167175769805908, + "learning_rate": 3.8140516998851354e-05, + "loss": 2.9921, + "step": 2289000 + }, + { + "epoch": 0.7117244123494055, + "grad_norm": 5.71364688873291, + "learning_rate": 3.813792646084324e-05, + "loss": 2.9519, + "step": 2289500 + }, + { + "epoch": 0.7118798446298924, + "grad_norm": 7.769402027130127, + "learning_rate": 3.813533592283513e-05, + "loss": 2.9494, + "step": 2290000 + }, + { + "epoch": 0.7120352769103793, + "grad_norm": 6.655770778656006, + "learning_rate": 3.8132745384827015e-05, + "loss": 2.9318, + "step": 2290500 + }, + { + "epoch": 0.7121907091908661, + "grad_norm": 6.5162553787231445, + "learning_rate": 3.81301548468189e-05, + "loss": 2.9404, + "step": 2291000 + }, + { + "epoch": 0.712346141471353, + "grad_norm": 7.338933944702148, + "learning_rate": 3.812756430881079e-05, + "loss": 2.973, + "step": 2291500 + }, + { + "epoch": 0.71250157375184, + "grad_norm": 12.796747207641602, + "learning_rate": 3.812497377080267e-05, + "loss": 2.9605, + "step": 2292000 + }, + { + "epoch": 0.7126570060323268, + "grad_norm": 8.59231185913086, + "learning_rate": 3.812238323279456e-05, + "loss": 2.8857, + "step": 2292500 + }, + { + "epoch": 0.7128124383128137, + "grad_norm": 7.802721977233887, + "learning_rate": 3.811979269478644e-05, + "loss": 2.9356, + "step": 2293000 + }, + { + "epoch": 0.7129678705933006, + "grad_norm": 11.51025390625, + "learning_rate": 3.8117202156778325e-05, + "loss": 2.9017, + "step": 2293500 + }, + { + "epoch": 0.7131233028737874, + "grad_norm": 8.422901153564453, + "learning_rate": 3.811461161877021e-05, + "loss": 2.9633, + "step": 2294000 + }, + { + "epoch": 0.7132787351542743, + "grad_norm": 6.592129707336426, + "learning_rate": 3.811202108076209e-05, + "loss": 2.9371, + "step": 2294500 + }, + { + "epoch": 0.7134341674347612, + "grad_norm": 9.75643539428711, + "learning_rate": 3.810943054275398e-05, + "loss": 2.9575, + "step": 2295000 + }, + { + "epoch": 0.713589599715248, + "grad_norm": 16.921005249023438, + "learning_rate": 3.8106840004745867e-05, + "loss": 2.9124, + "step": 2295500 + }, + { + "epoch": 0.7137450319957349, + "grad_norm": 8.14046859741211, + "learning_rate": 3.8104249466737754e-05, + "loss": 2.9408, + "step": 2296000 + }, + { + "epoch": 0.7139004642762218, + "grad_norm": 13.411778450012207, + "learning_rate": 3.810165892872964e-05, + "loss": 2.9157, + "step": 2296500 + }, + { + "epoch": 0.7140558965567086, + "grad_norm": 7.558412075042725, + "learning_rate": 3.809906839072153e-05, + "loss": 2.9497, + "step": 2297000 + }, + { + "epoch": 0.7142113288371955, + "grad_norm": 10.559947967529297, + "learning_rate": 3.809647785271341e-05, + "loss": 2.9602, + "step": 2297500 + }, + { + "epoch": 0.7143667611176825, + "grad_norm": 9.166895866394043, + "learning_rate": 3.8093887314705296e-05, + "loss": 2.9342, + "step": 2298000 + }, + { + "epoch": 0.7145221933981694, + "grad_norm": 7.751404285430908, + "learning_rate": 3.8091296776697176e-05, + "loss": 2.9555, + "step": 2298500 + }, + { + "epoch": 0.7146776256786562, + "grad_norm": 7.822848796844482, + "learning_rate": 3.808870623868906e-05, + "loss": 2.9929, + "step": 2299000 + }, + { + "epoch": 0.7148330579591431, + "grad_norm": 14.352851867675781, + "learning_rate": 3.808611570068095e-05, + "loss": 2.9577, + "step": 2299500 + }, + { + "epoch": 0.71498849023963, + "grad_norm": 9.326862335205078, + "learning_rate": 3.808352516267284e-05, + "loss": 2.8973, + "step": 2300000 + }, + { + "epoch": 0.7151439225201168, + "grad_norm": 19.03985595703125, + "learning_rate": 3.8080934624664725e-05, + "loss": 2.9316, + "step": 2300500 + }, + { + "epoch": 0.7152993548006037, + "grad_norm": 21.00799560546875, + "learning_rate": 3.807834408665661e-05, + "loss": 2.9081, + "step": 2301000 + }, + { + "epoch": 0.7154547870810906, + "grad_norm": 6.099493980407715, + "learning_rate": 3.807575354864849e-05, + "loss": 2.9156, + "step": 2301500 + }, + { + "epoch": 0.7156102193615774, + "grad_norm": 7.943130016326904, + "learning_rate": 3.807316301064038e-05, + "loss": 2.9263, + "step": 2302000 + }, + { + "epoch": 0.7157656516420643, + "grad_norm": 19.97551918029785, + "learning_rate": 3.8070572472632266e-05, + "loss": 2.9656, + "step": 2302500 + }, + { + "epoch": 0.7159210839225512, + "grad_norm": 8.685614585876465, + "learning_rate": 3.806798193462415e-05, + "loss": 2.9446, + "step": 2303000 + }, + { + "epoch": 0.716076516203038, + "grad_norm": 7.68137264251709, + "learning_rate": 3.8065391396616034e-05, + "loss": 2.9011, + "step": 2303500 + }, + { + "epoch": 0.716231948483525, + "grad_norm": 9.387707710266113, + "learning_rate": 3.8062800858607914e-05, + "loss": 2.9377, + "step": 2304000 + }, + { + "epoch": 0.7163873807640119, + "grad_norm": 11.291236877441406, + "learning_rate": 3.80602103205998e-05, + "loss": 2.9384, + "step": 2304500 + }, + { + "epoch": 0.7165428130444987, + "grad_norm": 8.572405815124512, + "learning_rate": 3.805761978259169e-05, + "loss": 2.9369, + "step": 2305000 + }, + { + "epoch": 0.7166982453249856, + "grad_norm": 18.201841354370117, + "learning_rate": 3.8055029244583576e-05, + "loss": 2.9178, + "step": 2305500 + }, + { + "epoch": 0.7168536776054725, + "grad_norm": 5.9560441970825195, + "learning_rate": 3.805243870657546e-05, + "loss": 2.9282, + "step": 2306000 + }, + { + "epoch": 0.7170091098859593, + "grad_norm": 8.413790702819824, + "learning_rate": 3.804984816856735e-05, + "loss": 2.9262, + "step": 2306500 + }, + { + "epoch": 0.7171645421664462, + "grad_norm": 20.347848892211914, + "learning_rate": 3.804725763055923e-05, + "loss": 2.957, + "step": 2307000 + }, + { + "epoch": 0.7173199744469331, + "grad_norm": 11.406888961791992, + "learning_rate": 3.804466709255112e-05, + "loss": 2.9141, + "step": 2307500 + }, + { + "epoch": 0.7174754067274199, + "grad_norm": 7.0285773277282715, + "learning_rate": 3.8042076554543005e-05, + "loss": 2.9251, + "step": 2308000 + }, + { + "epoch": 0.7176308390079068, + "grad_norm": 10.043110847473145, + "learning_rate": 3.8039486016534885e-05, + "loss": 2.932, + "step": 2308500 + }, + { + "epoch": 0.7177862712883937, + "grad_norm": 9.761016845703125, + "learning_rate": 3.803689547852677e-05, + "loss": 2.9211, + "step": 2309000 + }, + { + "epoch": 0.7179417035688805, + "grad_norm": 8.585451126098633, + "learning_rate": 3.803430494051866e-05, + "loss": 2.9436, + "step": 2309500 + }, + { + "epoch": 0.7180971358493675, + "grad_norm": 8.8250093460083, + "learning_rate": 3.8031714402510547e-05, + "loss": 2.9771, + "step": 2310000 + }, + { + "epoch": 0.7182525681298544, + "grad_norm": 9.41306209564209, + "learning_rate": 3.8029123864502434e-05, + "loss": 2.9251, + "step": 2310500 + }, + { + "epoch": 0.7184080004103413, + "grad_norm": 7.4682230949401855, + "learning_rate": 3.8026533326494314e-05, + "loss": 2.9266, + "step": 2311000 + }, + { + "epoch": 0.7185634326908281, + "grad_norm": 8.644424438476562, + "learning_rate": 3.80239427884862e-05, + "loss": 2.9608, + "step": 2311500 + }, + { + "epoch": 0.718718864971315, + "grad_norm": 7.918579578399658, + "learning_rate": 3.802135225047809e-05, + "loss": 2.9154, + "step": 2312000 + }, + { + "epoch": 0.7188742972518019, + "grad_norm": 11.614654541015625, + "learning_rate": 3.801876171246997e-05, + "loss": 2.9328, + "step": 2312500 + }, + { + "epoch": 0.7190297295322887, + "grad_norm": 11.966257095336914, + "learning_rate": 3.8016171174461856e-05, + "loss": 2.9179, + "step": 2313000 + }, + { + "epoch": 0.7191851618127756, + "grad_norm": 9.346205711364746, + "learning_rate": 3.801358063645374e-05, + "loss": 2.9257, + "step": 2313500 + }, + { + "epoch": 0.7193405940932625, + "grad_norm": 9.351652145385742, + "learning_rate": 3.8010990098445623e-05, + "loss": 2.9206, + "step": 2314000 + }, + { + "epoch": 0.7194960263737493, + "grad_norm": 13.508544921875, + "learning_rate": 3.800839956043751e-05, + "loss": 2.9255, + "step": 2314500 + }, + { + "epoch": 0.7196514586542362, + "grad_norm": 8.957846641540527, + "learning_rate": 3.80058090224294e-05, + "loss": 2.9325, + "step": 2315000 + }, + { + "epoch": 0.719806890934723, + "grad_norm": 7.344271183013916, + "learning_rate": 3.8003218484421285e-05, + "loss": 2.9612, + "step": 2315500 + }, + { + "epoch": 0.71996232321521, + "grad_norm": 12.726029396057129, + "learning_rate": 3.800062794641317e-05, + "loss": 2.9045, + "step": 2316000 + }, + { + "epoch": 0.7201177554956969, + "grad_norm": 10.373488426208496, + "learning_rate": 3.799803740840505e-05, + "loss": 2.9351, + "step": 2316500 + }, + { + "epoch": 0.7202731877761838, + "grad_norm": 9.366531372070312, + "learning_rate": 3.799544687039694e-05, + "loss": 2.9424, + "step": 2317000 + }, + { + "epoch": 0.7204286200566706, + "grad_norm": 13.25598430633545, + "learning_rate": 3.799285633238883e-05, + "loss": 2.9781, + "step": 2317500 + }, + { + "epoch": 0.7205840523371575, + "grad_norm": 12.937682151794434, + "learning_rate": 3.799026579438071e-05, + "loss": 2.8998, + "step": 2318000 + }, + { + "epoch": 0.7207394846176444, + "grad_norm": 39.65422821044922, + "learning_rate": 3.7987675256372594e-05, + "loss": 2.9471, + "step": 2318500 + }, + { + "epoch": 0.7208949168981312, + "grad_norm": 9.401334762573242, + "learning_rate": 3.798508471836448e-05, + "loss": 2.9448, + "step": 2319000 + }, + { + "epoch": 0.7210503491786181, + "grad_norm": 7.607376575469971, + "learning_rate": 3.798249418035636e-05, + "loss": 2.9536, + "step": 2319500 + }, + { + "epoch": 0.721205781459105, + "grad_norm": 8.959195137023926, + "learning_rate": 3.7979903642348256e-05, + "loss": 2.993, + "step": 2320000 + }, + { + "epoch": 0.7213612137395918, + "grad_norm": 8.90933609008789, + "learning_rate": 3.797731310434014e-05, + "loss": 2.9464, + "step": 2320500 + }, + { + "epoch": 0.7215166460200787, + "grad_norm": 7.316103458404541, + "learning_rate": 3.797472256633202e-05, + "loss": 2.9046, + "step": 2321000 + }, + { + "epoch": 0.7216720783005656, + "grad_norm": 8.686859130859375, + "learning_rate": 3.797213202832391e-05, + "loss": 2.8939, + "step": 2321500 + }, + { + "epoch": 0.7218275105810525, + "grad_norm": 8.932562828063965, + "learning_rate": 3.796954149031579e-05, + "loss": 2.9178, + "step": 2322000 + }, + { + "epoch": 0.7219829428615394, + "grad_norm": 12.923377990722656, + "learning_rate": 3.796695095230768e-05, + "loss": 2.9341, + "step": 2322500 + }, + { + "epoch": 0.7221383751420263, + "grad_norm": 13.669642448425293, + "learning_rate": 3.7964360414299565e-05, + "loss": 2.9163, + "step": 2323000 + }, + { + "epoch": 0.7222938074225131, + "grad_norm": 9.346426963806152, + "learning_rate": 3.7961769876291445e-05, + "loss": 2.9793, + "step": 2323500 + }, + { + "epoch": 0.722449239703, + "grad_norm": 10.354857444763184, + "learning_rate": 3.795917933828333e-05, + "loss": 2.8932, + "step": 2324000 + }, + { + "epoch": 0.7226046719834869, + "grad_norm": 9.237016677856445, + "learning_rate": 3.795658880027522e-05, + "loss": 2.8886, + "step": 2324500 + }, + { + "epoch": 0.7227601042639737, + "grad_norm": 9.245830535888672, + "learning_rate": 3.795399826226711e-05, + "loss": 2.9211, + "step": 2325000 + }, + { + "epoch": 0.7229155365444606, + "grad_norm": 12.566951751708984, + "learning_rate": 3.7951407724258994e-05, + "loss": 2.9553, + "step": 2325500 + }, + { + "epoch": 0.7230709688249475, + "grad_norm": 7.376453876495361, + "learning_rate": 3.794881718625088e-05, + "loss": 2.9789, + "step": 2326000 + }, + { + "epoch": 0.7232264011054343, + "grad_norm": 15.068887710571289, + "learning_rate": 3.794622664824276e-05, + "loss": 2.9123, + "step": 2326500 + }, + { + "epoch": 0.7233818333859212, + "grad_norm": 9.707876205444336, + "learning_rate": 3.794363611023465e-05, + "loss": 2.9617, + "step": 2327000 + }, + { + "epoch": 0.7235372656664081, + "grad_norm": 7.037379741668701, + "learning_rate": 3.7941045572226536e-05, + "loss": 2.9264, + "step": 2327500 + }, + { + "epoch": 0.7236926979468951, + "grad_norm": 9.24193000793457, + "learning_rate": 3.7938455034218416e-05, + "loss": 2.977, + "step": 2328000 + }, + { + "epoch": 0.7238481302273819, + "grad_norm": 10.2405424118042, + "learning_rate": 3.7935864496210303e-05, + "loss": 2.9395, + "step": 2328500 + }, + { + "epoch": 0.7240035625078688, + "grad_norm": 9.421602249145508, + "learning_rate": 3.7933273958202184e-05, + "loss": 2.9365, + "step": 2329000 + }, + { + "epoch": 0.7241589947883557, + "grad_norm": 8.947528839111328, + "learning_rate": 3.793068342019407e-05, + "loss": 2.9499, + "step": 2329500 + }, + { + "epoch": 0.7243144270688425, + "grad_norm": 11.334060668945312, + "learning_rate": 3.7928092882185965e-05, + "loss": 2.8671, + "step": 2330000 + }, + { + "epoch": 0.7244698593493294, + "grad_norm": 11.241881370544434, + "learning_rate": 3.7925502344177845e-05, + "loss": 2.9554, + "step": 2330500 + }, + { + "epoch": 0.7246252916298163, + "grad_norm": 8.28197956085205, + "learning_rate": 3.792291180616973e-05, + "loss": 2.9117, + "step": 2331000 + }, + { + "epoch": 0.7247807239103031, + "grad_norm": 10.662235260009766, + "learning_rate": 3.792032126816162e-05, + "loss": 2.9099, + "step": 2331500 + }, + { + "epoch": 0.72493615619079, + "grad_norm": 6.730345249176025, + "learning_rate": 3.79177307301535e-05, + "loss": 2.967, + "step": 2332000 + }, + { + "epoch": 0.7250915884712769, + "grad_norm": 11.866671562194824, + "learning_rate": 3.791514019214539e-05, + "loss": 2.8922, + "step": 2332500 + }, + { + "epoch": 0.7252470207517637, + "grad_norm": 6.810244083404541, + "learning_rate": 3.7912549654137274e-05, + "loss": 2.895, + "step": 2333000 + }, + { + "epoch": 0.7254024530322506, + "grad_norm": 11.708357810974121, + "learning_rate": 3.7909959116129155e-05, + "loss": 2.9185, + "step": 2333500 + }, + { + "epoch": 0.7255578853127376, + "grad_norm": 9.356689453125, + "learning_rate": 3.790736857812104e-05, + "loss": 2.9342, + "step": 2334000 + }, + { + "epoch": 0.7257133175932244, + "grad_norm": 9.67790412902832, + "learning_rate": 3.790477804011293e-05, + "loss": 2.9354, + "step": 2334500 + }, + { + "epoch": 0.7258687498737113, + "grad_norm": 9.316762924194336, + "learning_rate": 3.7902187502104816e-05, + "loss": 2.9222, + "step": 2335000 + }, + { + "epoch": 0.7260241821541982, + "grad_norm": 10.163370132446289, + "learning_rate": 3.78995969640967e-05, + "loss": 2.9644, + "step": 2335500 + }, + { + "epoch": 0.726179614434685, + "grad_norm": 6.619877338409424, + "learning_rate": 3.7897006426088584e-05, + "loss": 3.0128, + "step": 2336000 + }, + { + "epoch": 0.7263350467151719, + "grad_norm": 9.536344528198242, + "learning_rate": 3.789441588808047e-05, + "loss": 2.8991, + "step": 2336500 + }, + { + "epoch": 0.7264904789956588, + "grad_norm": 7.278010368347168, + "learning_rate": 3.789182535007236e-05, + "loss": 2.9258, + "step": 2337000 + }, + { + "epoch": 0.7266459112761456, + "grad_norm": 7.804418563842773, + "learning_rate": 3.788923481206424e-05, + "loss": 2.9984, + "step": 2337500 + }, + { + "epoch": 0.7268013435566325, + "grad_norm": 9.311053276062012, + "learning_rate": 3.7886644274056125e-05, + "loss": 2.9582, + "step": 2338000 + }, + { + "epoch": 0.7269567758371194, + "grad_norm": 10.002716064453125, + "learning_rate": 3.788405373604801e-05, + "loss": 2.9297, + "step": 2338500 + }, + { + "epoch": 0.7271122081176062, + "grad_norm": 8.807714462280273, + "learning_rate": 3.788146319803989e-05, + "loss": 2.9407, + "step": 2339000 + }, + { + "epoch": 0.7272676403980931, + "grad_norm": 7.601135730743408, + "learning_rate": 3.787887266003178e-05, + "loss": 2.8873, + "step": 2339500 + }, + { + "epoch": 0.7274230726785801, + "grad_norm": 13.09615707397461, + "learning_rate": 3.787628212202367e-05, + "loss": 2.9306, + "step": 2340000 + }, + { + "epoch": 0.727578504959067, + "grad_norm": 7.017319202423096, + "learning_rate": 3.7873691584015554e-05, + "loss": 2.907, + "step": 2340500 + }, + { + "epoch": 0.7277339372395538, + "grad_norm": 9.79653263092041, + "learning_rate": 3.787110104600744e-05, + "loss": 2.9143, + "step": 2341000 + }, + { + "epoch": 0.7278893695200407, + "grad_norm": 9.554059982299805, + "learning_rate": 3.786851050799932e-05, + "loss": 2.9425, + "step": 2341500 + }, + { + "epoch": 0.7280448018005276, + "grad_norm": 9.717782020568848, + "learning_rate": 3.786591996999121e-05, + "loss": 2.907, + "step": 2342000 + }, + { + "epoch": 0.7282002340810144, + "grad_norm": 12.2373046875, + "learning_rate": 3.7863329431983096e-05, + "loss": 2.9184, + "step": 2342500 + }, + { + "epoch": 0.7283556663615013, + "grad_norm": 8.470673561096191, + "learning_rate": 3.786073889397498e-05, + "loss": 2.9759, + "step": 2343000 + }, + { + "epoch": 0.7285110986419882, + "grad_norm": 14.083026885986328, + "learning_rate": 3.7858148355966864e-05, + "loss": 2.9133, + "step": 2343500 + }, + { + "epoch": 0.728666530922475, + "grad_norm": 9.7908296585083, + "learning_rate": 3.785555781795875e-05, + "loss": 2.884, + "step": 2344000 + }, + { + "epoch": 0.7288219632029619, + "grad_norm": 24.529882431030273, + "learning_rate": 3.785296727995064e-05, + "loss": 2.9324, + "step": 2344500 + }, + { + "epoch": 0.7289773954834488, + "grad_norm": 8.305882453918457, + "learning_rate": 3.7850376741942525e-05, + "loss": 2.9183, + "step": 2345000 + }, + { + "epoch": 0.7291328277639356, + "grad_norm": 7.934983253479004, + "learning_rate": 3.784778620393441e-05, + "loss": 2.9374, + "step": 2345500 + }, + { + "epoch": 0.7292882600444226, + "grad_norm": 7.78776216506958, + "learning_rate": 3.784519566592629e-05, + "loss": 2.8936, + "step": 2346000 + }, + { + "epoch": 0.7294436923249095, + "grad_norm": 8.070862770080566, + "learning_rate": 3.784260512791818e-05, + "loss": 2.9368, + "step": 2346500 + }, + { + "epoch": 0.7295991246053963, + "grad_norm": 43.7829475402832, + "learning_rate": 3.784001458991006e-05, + "loss": 2.9008, + "step": 2347000 + }, + { + "epoch": 0.7297545568858832, + "grad_norm": 8.341249465942383, + "learning_rate": 3.783742405190195e-05, + "loss": 2.9283, + "step": 2347500 + }, + { + "epoch": 0.7299099891663701, + "grad_norm": 7.995180130004883, + "learning_rate": 3.7834833513893835e-05, + "loss": 2.8691, + "step": 2348000 + }, + { + "epoch": 0.7300654214468569, + "grad_norm": 19.393936157226562, + "learning_rate": 3.7832242975885715e-05, + "loss": 2.9367, + "step": 2348500 + }, + { + "epoch": 0.7302208537273438, + "grad_norm": 10.224393844604492, + "learning_rate": 3.78296524378776e-05, + "loss": 2.9616, + "step": 2349000 + }, + { + "epoch": 0.7303762860078307, + "grad_norm": 23.879751205444336, + "learning_rate": 3.782706189986949e-05, + "loss": 2.9089, + "step": 2349500 + }, + { + "epoch": 0.7305317182883175, + "grad_norm": 8.228529930114746, + "learning_rate": 3.7824471361861377e-05, + "loss": 2.8755, + "step": 2350000 + }, + { + "epoch": 0.7306871505688044, + "grad_norm": 8.636054992675781, + "learning_rate": 3.7821880823853264e-05, + "loss": 2.9563, + "step": 2350500 + }, + { + "epoch": 0.7308425828492913, + "grad_norm": 10.349291801452637, + "learning_rate": 3.781929028584515e-05, + "loss": 2.9427, + "step": 2351000 + }, + { + "epoch": 0.7309980151297781, + "grad_norm": 7.75363302230835, + "learning_rate": 3.781669974783703e-05, + "loss": 2.9509, + "step": 2351500 + }, + { + "epoch": 0.7311534474102651, + "grad_norm": 9.506147384643555, + "learning_rate": 3.781410920982892e-05, + "loss": 2.9677, + "step": 2352000 + }, + { + "epoch": 0.731308879690752, + "grad_norm": 10.543326377868652, + "learning_rate": 3.78115186718208e-05, + "loss": 2.9334, + "step": 2352500 + }, + { + "epoch": 0.7314643119712388, + "grad_norm": 10.312222480773926, + "learning_rate": 3.7808928133812686e-05, + "loss": 2.9353, + "step": 2353000 + }, + { + "epoch": 0.7316197442517257, + "grad_norm": 7.381762504577637, + "learning_rate": 3.780633759580457e-05, + "loss": 2.9128, + "step": 2353500 + }, + { + "epoch": 0.7317751765322126, + "grad_norm": 8.761067390441895, + "learning_rate": 3.780374705779646e-05, + "loss": 2.9529, + "step": 2354000 + }, + { + "epoch": 0.7319306088126994, + "grad_norm": 29.767351150512695, + "learning_rate": 3.780115651978835e-05, + "loss": 2.9536, + "step": 2354500 + }, + { + "epoch": 0.7320860410931863, + "grad_norm": 6.85185432434082, + "learning_rate": 3.7798565981780234e-05, + "loss": 2.93, + "step": 2355000 + }, + { + "epoch": 0.7322414733736732, + "grad_norm": 6.064821243286133, + "learning_rate": 3.7795975443772115e-05, + "loss": 2.9031, + "step": 2355500 + }, + { + "epoch": 0.73239690565416, + "grad_norm": 16.97470474243164, + "learning_rate": 3.7793384905764e-05, + "loss": 2.9527, + "step": 2356000 + }, + { + "epoch": 0.7325523379346469, + "grad_norm": 8.61441707611084, + "learning_rate": 3.779079436775589e-05, + "loss": 2.9135, + "step": 2356500 + }, + { + "epoch": 0.7327077702151338, + "grad_norm": 15.449618339538574, + "learning_rate": 3.778820382974777e-05, + "loss": 2.9183, + "step": 2357000 + }, + { + "epoch": 0.7328632024956206, + "grad_norm": 8.52094841003418, + "learning_rate": 3.778561329173966e-05, + "loss": 2.9443, + "step": 2357500 + }, + { + "epoch": 0.7330186347761076, + "grad_norm": 9.431486129760742, + "learning_rate": 3.778302275373154e-05, + "loss": 2.9148, + "step": 2358000 + }, + { + "epoch": 0.7331740670565945, + "grad_norm": 10.864991188049316, + "learning_rate": 3.7780432215723424e-05, + "loss": 2.9047, + "step": 2358500 + }, + { + "epoch": 0.7333294993370814, + "grad_norm": 84.5085220336914, + "learning_rate": 3.777784167771531e-05, + "loss": 2.995, + "step": 2359000 + }, + { + "epoch": 0.7334849316175682, + "grad_norm": 7.95982551574707, + "learning_rate": 3.77752511397072e-05, + "loss": 2.9005, + "step": 2359500 + }, + { + "epoch": 0.7336403638980551, + "grad_norm": 10.535283088684082, + "learning_rate": 3.7772660601699086e-05, + "loss": 2.9149, + "step": 2360000 + }, + { + "epoch": 0.733795796178542, + "grad_norm": 9.977761268615723, + "learning_rate": 3.777007006369097e-05, + "loss": 2.9514, + "step": 2360500 + }, + { + "epoch": 0.7339512284590288, + "grad_norm": 9.277265548706055, + "learning_rate": 3.776747952568285e-05, + "loss": 2.9248, + "step": 2361000 + }, + { + "epoch": 0.7341066607395157, + "grad_norm": 6.113075256347656, + "learning_rate": 3.776488898767474e-05, + "loss": 2.8852, + "step": 2361500 + }, + { + "epoch": 0.7342620930200026, + "grad_norm": 8.045987129211426, + "learning_rate": 3.776229844966663e-05, + "loss": 2.8667, + "step": 2362000 + }, + { + "epoch": 0.7344175253004894, + "grad_norm": 8.485671997070312, + "learning_rate": 3.775970791165851e-05, + "loss": 2.9493, + "step": 2362500 + }, + { + "epoch": 0.7345729575809763, + "grad_norm": 10.909662246704102, + "learning_rate": 3.7757117373650395e-05, + "loss": 2.9166, + "step": 2363000 + }, + { + "epoch": 0.7347283898614632, + "grad_norm": 8.27088451385498, + "learning_rate": 3.775452683564228e-05, + "loss": 2.8625, + "step": 2363500 + }, + { + "epoch": 0.7348838221419501, + "grad_norm": 9.608234405517578, + "learning_rate": 3.775193629763417e-05, + "loss": 2.9013, + "step": 2364000 + }, + { + "epoch": 0.735039254422437, + "grad_norm": 7.6113481521606445, + "learning_rate": 3.7749345759626057e-05, + "loss": 2.9181, + "step": 2364500 + }, + { + "epoch": 0.7351946867029239, + "grad_norm": 8.94339370727539, + "learning_rate": 3.774675522161794e-05, + "loss": 2.945, + "step": 2365000 + }, + { + "epoch": 0.7353501189834107, + "grad_norm": 9.073722839355469, + "learning_rate": 3.7744164683609824e-05, + "loss": 2.9504, + "step": 2365500 + }, + { + "epoch": 0.7355055512638976, + "grad_norm": 12.093520164489746, + "learning_rate": 3.774157414560171e-05, + "loss": 2.9227, + "step": 2366000 + }, + { + "epoch": 0.7356609835443845, + "grad_norm": 9.81733226776123, + "learning_rate": 3.773898360759359e-05, + "loss": 2.9075, + "step": 2366500 + }, + { + "epoch": 0.7358164158248713, + "grad_norm": 8.43805980682373, + "learning_rate": 3.773639306958548e-05, + "loss": 3.0385, + "step": 2367000 + }, + { + "epoch": 0.7359718481053582, + "grad_norm": 8.624878883361816, + "learning_rate": 3.7733802531577366e-05, + "loss": 2.9036, + "step": 2367500 + }, + { + "epoch": 0.7361272803858451, + "grad_norm": 9.158726692199707, + "learning_rate": 3.7731211993569246e-05, + "loss": 2.9136, + "step": 2368000 + }, + { + "epoch": 0.7362827126663319, + "grad_norm": 7.115110397338867, + "learning_rate": 3.7728621455561133e-05, + "loss": 2.9408, + "step": 2368500 + }, + { + "epoch": 0.7364381449468188, + "grad_norm": 12.009929656982422, + "learning_rate": 3.772603091755302e-05, + "loss": 2.9687, + "step": 2369000 + }, + { + "epoch": 0.7365935772273057, + "grad_norm": 9.382519721984863, + "learning_rate": 3.772344037954491e-05, + "loss": 2.8827, + "step": 2369500 + }, + { + "epoch": 0.7367490095077927, + "grad_norm": 8.889625549316406, + "learning_rate": 3.7720849841536795e-05, + "loss": 2.9475, + "step": 2370000 + }, + { + "epoch": 0.7369044417882795, + "grad_norm": 8.955400466918945, + "learning_rate": 3.7718259303528675e-05, + "loss": 2.9062, + "step": 2370500 + }, + { + "epoch": 0.7370598740687664, + "grad_norm": 6.146303653717041, + "learning_rate": 3.771566876552056e-05, + "loss": 2.91, + "step": 2371000 + }, + { + "epoch": 0.7372153063492533, + "grad_norm": 8.613919258117676, + "learning_rate": 3.771307822751245e-05, + "loss": 2.9199, + "step": 2371500 + }, + { + "epoch": 0.7373707386297401, + "grad_norm": 7.530971527099609, + "learning_rate": 3.771048768950433e-05, + "loss": 2.8991, + "step": 2372000 + }, + { + "epoch": 0.737526170910227, + "grad_norm": 7.964534282684326, + "learning_rate": 3.770789715149622e-05, + "loss": 2.9182, + "step": 2372500 + }, + { + "epoch": 0.7376816031907139, + "grad_norm": 15.674410820007324, + "learning_rate": 3.7705306613488104e-05, + "loss": 2.9262, + "step": 2373000 + }, + { + "epoch": 0.7378370354712007, + "grad_norm": 11.374677658081055, + "learning_rate": 3.7702716075479985e-05, + "loss": 2.9097, + "step": 2373500 + }, + { + "epoch": 0.7379924677516876, + "grad_norm": 9.613028526306152, + "learning_rate": 3.770012553747188e-05, + "loss": 2.9462, + "step": 2374000 + }, + { + "epoch": 0.7381479000321745, + "grad_norm": 10.486263275146484, + "learning_rate": 3.7697534999463766e-05, + "loss": 2.9209, + "step": 2374500 + }, + { + "epoch": 0.7383033323126613, + "grad_norm": 6.872437477111816, + "learning_rate": 3.7694944461455646e-05, + "loss": 2.9427, + "step": 2375000 + }, + { + "epoch": 0.7384587645931482, + "grad_norm": 8.345071792602539, + "learning_rate": 3.769235392344753e-05, + "loss": 2.9529, + "step": 2375500 + }, + { + "epoch": 0.7386141968736352, + "grad_norm": 8.025613784790039, + "learning_rate": 3.7689763385439414e-05, + "loss": 2.9016, + "step": 2376000 + }, + { + "epoch": 0.738769629154122, + "grad_norm": 8.331981658935547, + "learning_rate": 3.76871728474313e-05, + "loss": 2.9429, + "step": 2376500 + }, + { + "epoch": 0.7389250614346089, + "grad_norm": 7.258233547210693, + "learning_rate": 3.768458230942319e-05, + "loss": 2.9477, + "step": 2377000 + }, + { + "epoch": 0.7390804937150958, + "grad_norm": 9.312272071838379, + "learning_rate": 3.768199177141507e-05, + "loss": 2.9546, + "step": 2377500 + }, + { + "epoch": 0.7392359259955826, + "grad_norm": 7.424601078033447, + "learning_rate": 3.7679401233406955e-05, + "loss": 2.9413, + "step": 2378000 + }, + { + "epoch": 0.7393913582760695, + "grad_norm": 9.546066284179688, + "learning_rate": 3.767681069539884e-05, + "loss": 2.8968, + "step": 2378500 + }, + { + "epoch": 0.7395467905565564, + "grad_norm": 8.547517776489258, + "learning_rate": 3.767422015739073e-05, + "loss": 2.886, + "step": 2379000 + }, + { + "epoch": 0.7397022228370432, + "grad_norm": 9.9844970703125, + "learning_rate": 3.767162961938262e-05, + "loss": 2.8786, + "step": 2379500 + }, + { + "epoch": 0.7398576551175301, + "grad_norm": 7.7369279861450195, + "learning_rate": 3.7669039081374504e-05, + "loss": 2.9264, + "step": 2380000 + }, + { + "epoch": 0.740013087398017, + "grad_norm": 15.67415714263916, + "learning_rate": 3.7666448543366384e-05, + "loss": 2.906, + "step": 2380500 + }, + { + "epoch": 0.7401685196785038, + "grad_norm": 13.257431030273438, + "learning_rate": 3.766385800535827e-05, + "loss": 2.9589, + "step": 2381000 + }, + { + "epoch": 0.7403239519589907, + "grad_norm": 10.047365188598633, + "learning_rate": 3.766126746735016e-05, + "loss": 2.9288, + "step": 2381500 + }, + { + "epoch": 0.7404793842394777, + "grad_norm": 10.243573188781738, + "learning_rate": 3.765867692934204e-05, + "loss": 2.9278, + "step": 2382000 + }, + { + "epoch": 0.7406348165199645, + "grad_norm": 33.77231979370117, + "learning_rate": 3.7656086391333926e-05, + "loss": 2.9239, + "step": 2382500 + }, + { + "epoch": 0.7407902488004514, + "grad_norm": 10.291876792907715, + "learning_rate": 3.765349585332581e-05, + "loss": 2.985, + "step": 2383000 + }, + { + "epoch": 0.7409456810809383, + "grad_norm": 10.537397384643555, + "learning_rate": 3.7650905315317694e-05, + "loss": 2.9275, + "step": 2383500 + }, + { + "epoch": 0.7411011133614251, + "grad_norm": 12.683045387268066, + "learning_rate": 3.764831477730959e-05, + "loss": 2.99, + "step": 2384000 + }, + { + "epoch": 0.741256545641912, + "grad_norm": 7.220395565032959, + "learning_rate": 3.764572423930147e-05, + "loss": 2.938, + "step": 2384500 + }, + { + "epoch": 0.7414119779223989, + "grad_norm": 8.75603199005127, + "learning_rate": 3.7643133701293355e-05, + "loss": 2.9149, + "step": 2385000 + }, + { + "epoch": 0.7415674102028857, + "grad_norm": 16.455181121826172, + "learning_rate": 3.764054316328524e-05, + "loss": 2.9488, + "step": 2385500 + }, + { + "epoch": 0.7417228424833726, + "grad_norm": 6.809926986694336, + "learning_rate": 3.763795262527712e-05, + "loss": 2.9181, + "step": 2386000 + }, + { + "epoch": 0.7418782747638595, + "grad_norm": 8.191847801208496, + "learning_rate": 3.763536208726901e-05, + "loss": 2.8991, + "step": 2386500 + }, + { + "epoch": 0.7420337070443463, + "grad_norm": 8.414304733276367, + "learning_rate": 3.76327715492609e-05, + "loss": 2.9511, + "step": 2387000 + }, + { + "epoch": 0.7421891393248332, + "grad_norm": 9.024744033813477, + "learning_rate": 3.763018101125278e-05, + "loss": 2.9384, + "step": 2387500 + }, + { + "epoch": 0.7423445716053202, + "grad_norm": 8.020288467407227, + "learning_rate": 3.7627590473244665e-05, + "loss": 2.9508, + "step": 2388000 + }, + { + "epoch": 0.7425000038858071, + "grad_norm": 7.466517448425293, + "learning_rate": 3.762499993523655e-05, + "loss": 2.9325, + "step": 2388500 + }, + { + "epoch": 0.7426554361662939, + "grad_norm": 8.519065856933594, + "learning_rate": 3.762240939722844e-05, + "loss": 2.9526, + "step": 2389000 + }, + { + "epoch": 0.7428108684467808, + "grad_norm": 7.274235725402832, + "learning_rate": 3.7619818859220326e-05, + "loss": 2.9185, + "step": 2389500 + }, + { + "epoch": 0.7429663007272677, + "grad_norm": 9.980324745178223, + "learning_rate": 3.7617228321212206e-05, + "loss": 2.9046, + "step": 2390000 + }, + { + "epoch": 0.7431217330077545, + "grad_norm": 14.862370491027832, + "learning_rate": 3.7614637783204094e-05, + "loss": 2.9092, + "step": 2390500 + }, + { + "epoch": 0.7432771652882414, + "grad_norm": 30.023975372314453, + "learning_rate": 3.761204724519598e-05, + "loss": 2.9496, + "step": 2391000 + }, + { + "epoch": 0.7434325975687283, + "grad_norm": 6.549309730529785, + "learning_rate": 3.760945670718786e-05, + "loss": 2.9023, + "step": 2391500 + }, + { + "epoch": 0.7435880298492151, + "grad_norm": 8.367191314697266, + "learning_rate": 3.760686616917975e-05, + "loss": 2.9205, + "step": 2392000 + }, + { + "epoch": 0.743743462129702, + "grad_norm": 7.612793445587158, + "learning_rate": 3.7604275631171635e-05, + "loss": 2.965, + "step": 2392500 + }, + { + "epoch": 0.7438988944101889, + "grad_norm": 10.997359275817871, + "learning_rate": 3.7601685093163516e-05, + "loss": 2.8912, + "step": 2393000 + }, + { + "epoch": 0.7440543266906757, + "grad_norm": 8.059895515441895, + "learning_rate": 3.75990945551554e-05, + "loss": 2.9185, + "step": 2393500 + }, + { + "epoch": 0.7442097589711626, + "grad_norm": 8.135124206542969, + "learning_rate": 3.75965040171473e-05, + "loss": 2.9143, + "step": 2394000 + }, + { + "epoch": 0.7443651912516496, + "grad_norm": 9.349045753479004, + "learning_rate": 3.759391347913918e-05, + "loss": 2.9492, + "step": 2394500 + }, + { + "epoch": 0.7445206235321364, + "grad_norm": 7.810096740722656, + "learning_rate": 3.7591322941131064e-05, + "loss": 2.9618, + "step": 2395000 + }, + { + "epoch": 0.7446760558126233, + "grad_norm": 7.926739692687988, + "learning_rate": 3.7588732403122945e-05, + "loss": 2.8917, + "step": 2395500 + }, + { + "epoch": 0.7448314880931102, + "grad_norm": 9.392163276672363, + "learning_rate": 3.758614186511483e-05, + "loss": 2.9505, + "step": 2396000 + }, + { + "epoch": 0.744986920373597, + "grad_norm": 8.43094253540039, + "learning_rate": 3.758355132710672e-05, + "loss": 2.9353, + "step": 2396500 + }, + { + "epoch": 0.7451423526540839, + "grad_norm": 8.181268692016602, + "learning_rate": 3.75809607890986e-05, + "loss": 2.9291, + "step": 2397000 + }, + { + "epoch": 0.7452977849345708, + "grad_norm": 8.116567611694336, + "learning_rate": 3.757837025109049e-05, + "loss": 2.9202, + "step": 2397500 + }, + { + "epoch": 0.7454532172150576, + "grad_norm": 7.749307632446289, + "learning_rate": 3.7575779713082374e-05, + "loss": 2.9114, + "step": 2398000 + }, + { + "epoch": 0.7456086494955445, + "grad_norm": 9.607280731201172, + "learning_rate": 3.757318917507426e-05, + "loss": 2.9406, + "step": 2398500 + }, + { + "epoch": 0.7457640817760314, + "grad_norm": 8.495089530944824, + "learning_rate": 3.757059863706615e-05, + "loss": 2.9209, + "step": 2399000 + }, + { + "epoch": 0.7459195140565182, + "grad_norm": 8.848071098327637, + "learning_rate": 3.7568008099058035e-05, + "loss": 2.9223, + "step": 2399500 + }, + { + "epoch": 0.7460749463370051, + "grad_norm": 7.81467342376709, + "learning_rate": 3.7565417561049916e-05, + "loss": 2.9162, + "step": 2400000 + }, + { + "epoch": 0.7462303786174921, + "grad_norm": 9.230598449707031, + "learning_rate": 3.75628270230418e-05, + "loss": 2.9119, + "step": 2400500 + }, + { + "epoch": 0.746385810897979, + "grad_norm": 8.323214530944824, + "learning_rate": 3.756023648503368e-05, + "loss": 2.9666, + "step": 2401000 + }, + { + "epoch": 0.7465412431784658, + "grad_norm": 15.290433883666992, + "learning_rate": 3.755764594702557e-05, + "loss": 2.9091, + "step": 2401500 + }, + { + "epoch": 0.7466966754589527, + "grad_norm": 6.112323760986328, + "learning_rate": 3.755505540901746e-05, + "loss": 2.9536, + "step": 2402000 + }, + { + "epoch": 0.7468521077394396, + "grad_norm": 9.539912223815918, + "learning_rate": 3.755246487100934e-05, + "loss": 2.9273, + "step": 2402500 + }, + { + "epoch": 0.7470075400199264, + "grad_norm": 8.406208038330078, + "learning_rate": 3.7549874333001225e-05, + "loss": 2.9571, + "step": 2403000 + }, + { + "epoch": 0.7471629723004133, + "grad_norm": 8.556228637695312, + "learning_rate": 3.754728379499311e-05, + "loss": 2.9544, + "step": 2403500 + }, + { + "epoch": 0.7473184045809002, + "grad_norm": 7.344978332519531, + "learning_rate": 3.7544693256985e-05, + "loss": 2.8998, + "step": 2404000 + }, + { + "epoch": 0.747473836861387, + "grad_norm": 8.346508979797363, + "learning_rate": 3.7542102718976886e-05, + "loss": 2.8684, + "step": 2404500 + }, + { + "epoch": 0.7476292691418739, + "grad_norm": 7.210219383239746, + "learning_rate": 3.7539512180968774e-05, + "loss": 2.9387, + "step": 2405000 + }, + { + "epoch": 0.7477847014223608, + "grad_norm": 9.05495548248291, + "learning_rate": 3.7536921642960654e-05, + "loss": 2.93, + "step": 2405500 + }, + { + "epoch": 0.7479401337028476, + "grad_norm": 6.441583156585693, + "learning_rate": 3.753433110495254e-05, + "loss": 2.9577, + "step": 2406000 + }, + { + "epoch": 0.7480955659833346, + "grad_norm": 23.127168655395508, + "learning_rate": 3.753174056694442e-05, + "loss": 2.9148, + "step": 2406500 + }, + { + "epoch": 0.7482509982638215, + "grad_norm": 8.091024398803711, + "learning_rate": 3.752915002893631e-05, + "loss": 2.9167, + "step": 2407000 + }, + { + "epoch": 0.7484064305443083, + "grad_norm": 8.967491149902344, + "learning_rate": 3.7526559490928196e-05, + "loss": 2.9225, + "step": 2407500 + }, + { + "epoch": 0.7485618628247952, + "grad_norm": 8.43486213684082, + "learning_rate": 3.752396895292008e-05, + "loss": 2.9813, + "step": 2408000 + }, + { + "epoch": 0.7487172951052821, + "grad_norm": 7.999169826507568, + "learning_rate": 3.752137841491197e-05, + "loss": 2.917, + "step": 2408500 + }, + { + "epoch": 0.7488727273857689, + "grad_norm": 9.931756019592285, + "learning_rate": 3.751878787690386e-05, + "loss": 2.8921, + "step": 2409000 + }, + { + "epoch": 0.7490281596662558, + "grad_norm": 9.628164291381836, + "learning_rate": 3.751619733889574e-05, + "loss": 2.9513, + "step": 2409500 + }, + { + "epoch": 0.7491835919467427, + "grad_norm": 10.732168197631836, + "learning_rate": 3.7513606800887625e-05, + "loss": 2.9488, + "step": 2410000 + }, + { + "epoch": 0.7493390242272295, + "grad_norm": 10.6213960647583, + "learning_rate": 3.751101626287951e-05, + "loss": 2.8779, + "step": 2410500 + }, + { + "epoch": 0.7494944565077164, + "grad_norm": 10.010478019714355, + "learning_rate": 3.750842572487139e-05, + "loss": 2.93, + "step": 2411000 + }, + { + "epoch": 0.7496498887882033, + "grad_norm": 9.744873046875, + "learning_rate": 3.750583518686328e-05, + "loss": 2.9348, + "step": 2411500 + }, + { + "epoch": 0.7498053210686901, + "grad_norm": 6.850436687469482, + "learning_rate": 3.750324464885517e-05, + "loss": 2.9116, + "step": 2412000 + }, + { + "epoch": 0.7499607533491771, + "grad_norm": 9.060264587402344, + "learning_rate": 3.750065411084705e-05, + "loss": 2.9105, + "step": 2412500 + }, + { + "epoch": 0.750116185629664, + "grad_norm": 9.236757278442383, + "learning_rate": 3.7498063572838934e-05, + "loss": 2.9564, + "step": 2413000 + }, + { + "epoch": 0.7502716179101508, + "grad_norm": 8.219728469848633, + "learning_rate": 3.749547303483082e-05, + "loss": 2.8875, + "step": 2413500 + }, + { + "epoch": 0.7504270501906377, + "grad_norm": 7.537879943847656, + "learning_rate": 3.749288249682271e-05, + "loss": 2.8774, + "step": 2414000 + }, + { + "epoch": 0.7505824824711246, + "grad_norm": 8.212201118469238, + "learning_rate": 3.7490291958814596e-05, + "loss": 2.9387, + "step": 2414500 + }, + { + "epoch": 0.7507379147516114, + "grad_norm": 8.997919082641602, + "learning_rate": 3.7487701420806476e-05, + "loss": 2.9345, + "step": 2415000 + }, + { + "epoch": 0.7508933470320983, + "grad_norm": 10.202757835388184, + "learning_rate": 3.748511088279836e-05, + "loss": 2.9366, + "step": 2415500 + }, + { + "epoch": 0.7510487793125852, + "grad_norm": 9.031081199645996, + "learning_rate": 3.748252034479025e-05, + "loss": 2.9659, + "step": 2416000 + }, + { + "epoch": 0.751204211593072, + "grad_norm": 8.767247200012207, + "learning_rate": 3.747992980678213e-05, + "loss": 2.9427, + "step": 2416500 + }, + { + "epoch": 0.7513596438735589, + "grad_norm": 18.89099884033203, + "learning_rate": 3.747733926877402e-05, + "loss": 2.9753, + "step": 2417000 + }, + { + "epoch": 0.7515150761540458, + "grad_norm": 7.435710430145264, + "learning_rate": 3.7474748730765905e-05, + "loss": 2.9285, + "step": 2417500 + }, + { + "epoch": 0.7516705084345326, + "grad_norm": 12.602779388427734, + "learning_rate": 3.747215819275779e-05, + "loss": 2.9398, + "step": 2418000 + }, + { + "epoch": 0.7518259407150196, + "grad_norm": 9.55699634552002, + "learning_rate": 3.746956765474968e-05, + "loss": 2.9282, + "step": 2418500 + }, + { + "epoch": 0.7519813729955065, + "grad_norm": 11.02229118347168, + "learning_rate": 3.746697711674156e-05, + "loss": 2.9222, + "step": 2419000 + }, + { + "epoch": 0.7521368052759934, + "grad_norm": 12.496036529541016, + "learning_rate": 3.746438657873345e-05, + "loss": 2.9542, + "step": 2419500 + }, + { + "epoch": 0.7522922375564802, + "grad_norm": 10.779166221618652, + "learning_rate": 3.7461796040725334e-05, + "loss": 2.9461, + "step": 2420000 + }, + { + "epoch": 0.7524476698369671, + "grad_norm": 9.669134140014648, + "learning_rate": 3.7459205502717214e-05, + "loss": 2.9969, + "step": 2420500 + }, + { + "epoch": 0.752603102117454, + "grad_norm": 12.250530242919922, + "learning_rate": 3.74566149647091e-05, + "loss": 2.9181, + "step": 2421000 + }, + { + "epoch": 0.7527585343979408, + "grad_norm": 9.325289726257324, + "learning_rate": 3.745402442670099e-05, + "loss": 2.9047, + "step": 2421500 + }, + { + "epoch": 0.7529139666784277, + "grad_norm": 8.386984825134277, + "learning_rate": 3.745143388869287e-05, + "loss": 2.9474, + "step": 2422000 + }, + { + "epoch": 0.7530693989589146, + "grad_norm": 8.102294921875, + "learning_rate": 3.7448843350684756e-05, + "loss": 2.9292, + "step": 2422500 + }, + { + "epoch": 0.7532248312394014, + "grad_norm": 8.472429275512695, + "learning_rate": 3.7446252812676643e-05, + "loss": 2.9171, + "step": 2423000 + }, + { + "epoch": 0.7533802635198883, + "grad_norm": 10.393924713134766, + "learning_rate": 3.744366227466853e-05, + "loss": 2.9548, + "step": 2423500 + }, + { + "epoch": 0.7535356958003752, + "grad_norm": 8.921616554260254, + "learning_rate": 3.744107173666042e-05, + "loss": 2.9115, + "step": 2424000 + }, + { + "epoch": 0.7536911280808621, + "grad_norm": 7.821990013122559, + "learning_rate": 3.74384811986523e-05, + "loss": 2.8971, + "step": 2424500 + }, + { + "epoch": 0.753846560361349, + "grad_norm": 9.081876754760742, + "learning_rate": 3.7435890660644185e-05, + "loss": 2.9467, + "step": 2425000 + }, + { + "epoch": 0.7540019926418359, + "grad_norm": 8.06319522857666, + "learning_rate": 3.743330012263607e-05, + "loss": 2.9381, + "step": 2425500 + }, + { + "epoch": 0.7541574249223227, + "grad_norm": 5.128745079040527, + "learning_rate": 3.743070958462795e-05, + "loss": 2.9219, + "step": 2426000 + }, + { + "epoch": 0.7543128572028096, + "grad_norm": 10.389911651611328, + "learning_rate": 3.742811904661984e-05, + "loss": 2.9454, + "step": 2426500 + }, + { + "epoch": 0.7544682894832965, + "grad_norm": 20.17829132080078, + "learning_rate": 3.742552850861173e-05, + "loss": 2.9883, + "step": 2427000 + }, + { + "epoch": 0.7546237217637833, + "grad_norm": 9.811346054077148, + "learning_rate": 3.7422937970603614e-05, + "loss": 2.9435, + "step": 2427500 + }, + { + "epoch": 0.7547791540442702, + "grad_norm": 11.322876930236816, + "learning_rate": 3.74203474325955e-05, + "loss": 2.935, + "step": 2428000 + }, + { + "epoch": 0.7549345863247571, + "grad_norm": 9.085777282714844, + "learning_rate": 3.741775689458739e-05, + "loss": 2.9284, + "step": 2428500 + }, + { + "epoch": 0.7550900186052439, + "grad_norm": 10.793277740478516, + "learning_rate": 3.741516635657927e-05, + "loss": 2.9403, + "step": 2429000 + }, + { + "epoch": 0.7552454508857308, + "grad_norm": 7.865653038024902, + "learning_rate": 3.7412575818571156e-05, + "loss": 2.8668, + "step": 2429500 + }, + { + "epoch": 0.7554008831662177, + "grad_norm": 8.981369018554688, + "learning_rate": 3.740998528056304e-05, + "loss": 2.9354, + "step": 2430000 + }, + { + "epoch": 0.7555563154467047, + "grad_norm": 9.209405899047852, + "learning_rate": 3.7407394742554924e-05, + "loss": 2.9385, + "step": 2430500 + }, + { + "epoch": 0.7557117477271915, + "grad_norm": 9.51364517211914, + "learning_rate": 3.740480420454681e-05, + "loss": 2.9138, + "step": 2431000 + }, + { + "epoch": 0.7558671800076784, + "grad_norm": 9.010148048400879, + "learning_rate": 3.740221366653869e-05, + "loss": 2.9317, + "step": 2431500 + }, + { + "epoch": 0.7560226122881653, + "grad_norm": 8.64733600616455, + "learning_rate": 3.739962312853058e-05, + "loss": 2.8698, + "step": 2432000 + }, + { + "epoch": 0.7561780445686521, + "grad_norm": 11.201050758361816, + "learning_rate": 3.7397032590522465e-05, + "loss": 2.9816, + "step": 2432500 + }, + { + "epoch": 0.756333476849139, + "grad_norm": 10.73585319519043, + "learning_rate": 3.739444205251435e-05, + "loss": 2.8761, + "step": 2433000 + }, + { + "epoch": 0.7564889091296259, + "grad_norm": 11.906293869018555, + "learning_rate": 3.739185151450624e-05, + "loss": 2.9156, + "step": 2433500 + }, + { + "epoch": 0.7566443414101127, + "grad_norm": 9.037018775939941, + "learning_rate": 3.738926097649813e-05, + "loss": 2.9054, + "step": 2434000 + }, + { + "epoch": 0.7567997736905996, + "grad_norm": 8.279939651489258, + "learning_rate": 3.738667043849001e-05, + "loss": 2.9419, + "step": 2434500 + }, + { + "epoch": 0.7569552059710865, + "grad_norm": 6.087986946105957, + "learning_rate": 3.7384079900481894e-05, + "loss": 2.9142, + "step": 2435000 + }, + { + "epoch": 0.7571106382515733, + "grad_norm": 7.675699234008789, + "learning_rate": 3.738148936247378e-05, + "loss": 2.9202, + "step": 2435500 + }, + { + "epoch": 0.7572660705320602, + "grad_norm": 11.147599220275879, + "learning_rate": 3.737889882446566e-05, + "loss": 2.9601, + "step": 2436000 + }, + { + "epoch": 0.7574215028125472, + "grad_norm": 7.6424665451049805, + "learning_rate": 3.737630828645755e-05, + "loss": 2.9237, + "step": 2436500 + }, + { + "epoch": 0.757576935093034, + "grad_norm": 8.134702682495117, + "learning_rate": 3.737371774844943e-05, + "loss": 2.9351, + "step": 2437000 + }, + { + "epoch": 0.7577323673735209, + "grad_norm": 10.282275199890137, + "learning_rate": 3.7371127210441323e-05, + "loss": 2.9185, + "step": 2437500 + }, + { + "epoch": 0.7578877996540078, + "grad_norm": 7.102168560028076, + "learning_rate": 3.736853667243321e-05, + "loss": 2.9163, + "step": 2438000 + }, + { + "epoch": 0.7580432319344946, + "grad_norm": 8.51026725769043, + "learning_rate": 3.736594613442509e-05, + "loss": 2.9059, + "step": 2438500 + }, + { + "epoch": 0.7581986642149815, + "grad_norm": 8.007821083068848, + "learning_rate": 3.736335559641698e-05, + "loss": 2.935, + "step": 2439000 + }, + { + "epoch": 0.7583540964954684, + "grad_norm": 8.161617279052734, + "learning_rate": 3.7360765058408865e-05, + "loss": 2.9181, + "step": 2439500 + }, + { + "epoch": 0.7585095287759552, + "grad_norm": 9.644336700439453, + "learning_rate": 3.7358174520400746e-05, + "loss": 2.9165, + "step": 2440000 + }, + { + "epoch": 0.7586649610564421, + "grad_norm": 17.269758224487305, + "learning_rate": 3.735558398239263e-05, + "loss": 2.8825, + "step": 2440500 + }, + { + "epoch": 0.758820393336929, + "grad_norm": 9.357958793640137, + "learning_rate": 3.735299344438452e-05, + "loss": 2.9122, + "step": 2441000 + }, + { + "epoch": 0.7589758256174158, + "grad_norm": 9.979338645935059, + "learning_rate": 3.73504029063764e-05, + "loss": 2.9825, + "step": 2441500 + }, + { + "epoch": 0.7591312578979027, + "grad_norm": 19.60955238342285, + "learning_rate": 3.734781236836829e-05, + "loss": 2.9211, + "step": 2442000 + }, + { + "epoch": 0.7592866901783897, + "grad_norm": 7.917449474334717, + "learning_rate": 3.7345221830360175e-05, + "loss": 2.9189, + "step": 2442500 + }, + { + "epoch": 0.7594421224588765, + "grad_norm": 19.95669174194336, + "learning_rate": 3.734263129235206e-05, + "loss": 2.9241, + "step": 2443000 + }, + { + "epoch": 0.7595975547393634, + "grad_norm": 23.609451293945312, + "learning_rate": 3.734004075434395e-05, + "loss": 2.9126, + "step": 2443500 + }, + { + "epoch": 0.7597529870198503, + "grad_norm": 20.0595645904541, + "learning_rate": 3.733745021633583e-05, + "loss": 2.8969, + "step": 2444000 + }, + { + "epoch": 0.7599084193003371, + "grad_norm": 10.219922065734863, + "learning_rate": 3.7334859678327716e-05, + "loss": 2.9249, + "step": 2444500 + }, + { + "epoch": 0.760063851580824, + "grad_norm": 10.981535911560059, + "learning_rate": 3.7332269140319604e-05, + "loss": 2.9052, + "step": 2445000 + }, + { + "epoch": 0.7602192838613109, + "grad_norm": 8.763209342956543, + "learning_rate": 3.7329678602311484e-05, + "loss": 2.8914, + "step": 2445500 + }, + { + "epoch": 0.7603747161417977, + "grad_norm": 15.890460968017578, + "learning_rate": 3.732708806430337e-05, + "loss": 2.9106, + "step": 2446000 + }, + { + "epoch": 0.7605301484222846, + "grad_norm": 7.216142177581787, + "learning_rate": 3.732449752629526e-05, + "loss": 2.9515, + "step": 2446500 + }, + { + "epoch": 0.7606855807027715, + "grad_norm": 7.604504108428955, + "learning_rate": 3.732190698828714e-05, + "loss": 2.9315, + "step": 2447000 + }, + { + "epoch": 0.7608410129832583, + "grad_norm": 8.351101875305176, + "learning_rate": 3.731931645027903e-05, + "loss": 2.9249, + "step": 2447500 + }, + { + "epoch": 0.7609964452637452, + "grad_norm": 8.52433967590332, + "learning_rate": 3.731672591227092e-05, + "loss": 2.9366, + "step": 2448000 + }, + { + "epoch": 0.7611518775442322, + "grad_norm": 11.102145195007324, + "learning_rate": 3.73141353742628e-05, + "loss": 2.8958, + "step": 2448500 + }, + { + "epoch": 0.7613073098247191, + "grad_norm": 8.494248390197754, + "learning_rate": 3.731154483625469e-05, + "loss": 2.8955, + "step": 2449000 + }, + { + "epoch": 0.7614627421052059, + "grad_norm": 9.600749969482422, + "learning_rate": 3.730895429824657e-05, + "loss": 2.9141, + "step": 2449500 + }, + { + "epoch": 0.7616181743856928, + "grad_norm": 7.731563568115234, + "learning_rate": 3.7306363760238455e-05, + "loss": 2.8814, + "step": 2450000 + }, + { + "epoch": 0.7617736066661797, + "grad_norm": 7.4107985496521, + "learning_rate": 3.730377322223034e-05, + "loss": 2.8986, + "step": 2450500 + }, + { + "epoch": 0.7619290389466665, + "grad_norm": 6.064044952392578, + "learning_rate": 3.730118268422222e-05, + "loss": 2.8956, + "step": 2451000 + }, + { + "epoch": 0.7620844712271534, + "grad_norm": 7.765089511871338, + "learning_rate": 3.729859214621411e-05, + "loss": 2.9435, + "step": 2451500 + }, + { + "epoch": 0.7622399035076403, + "grad_norm": 9.985433578491211, + "learning_rate": 3.7296001608206e-05, + "loss": 2.8951, + "step": 2452000 + }, + { + "epoch": 0.7623953357881271, + "grad_norm": 24.395994186401367, + "learning_rate": 3.7293411070197884e-05, + "loss": 2.9233, + "step": 2452500 + }, + { + "epoch": 0.762550768068614, + "grad_norm": 8.812500953674316, + "learning_rate": 3.729082053218977e-05, + "loss": 2.9456, + "step": 2453000 + }, + { + "epoch": 0.7627062003491009, + "grad_norm": 7.377281665802002, + "learning_rate": 3.728822999418166e-05, + "loss": 2.8789, + "step": 2453500 + }, + { + "epoch": 0.7628616326295877, + "grad_norm": 7.524111270904541, + "learning_rate": 3.728563945617354e-05, + "loss": 2.9023, + "step": 2454000 + }, + { + "epoch": 0.7630170649100747, + "grad_norm": 7.361031532287598, + "learning_rate": 3.7283048918165426e-05, + "loss": 2.9318, + "step": 2454500 + }, + { + "epoch": 0.7631724971905616, + "grad_norm": 10.224342346191406, + "learning_rate": 3.7280458380157306e-05, + "loss": 2.9336, + "step": 2455000 + }, + { + "epoch": 0.7633279294710484, + "grad_norm": 20.335458755493164, + "learning_rate": 3.727786784214919e-05, + "loss": 2.9086, + "step": 2455500 + }, + { + "epoch": 0.7634833617515353, + "grad_norm": 9.403778076171875, + "learning_rate": 3.727527730414108e-05, + "loss": 2.9176, + "step": 2456000 + }, + { + "epoch": 0.7636387940320222, + "grad_norm": 13.284541130065918, + "learning_rate": 3.727268676613296e-05, + "loss": 2.8977, + "step": 2456500 + }, + { + "epoch": 0.763794226312509, + "grad_norm": 8.875560760498047, + "learning_rate": 3.727009622812485e-05, + "loss": 2.8998, + "step": 2457000 + }, + { + "epoch": 0.7639496585929959, + "grad_norm": 8.531681060791016, + "learning_rate": 3.726750569011674e-05, + "loss": 2.9336, + "step": 2457500 + }, + { + "epoch": 0.7641050908734828, + "grad_norm": 7.717087745666504, + "learning_rate": 3.726491515210862e-05, + "loss": 2.8812, + "step": 2458000 + }, + { + "epoch": 0.7642605231539696, + "grad_norm": 7.088630676269531, + "learning_rate": 3.726232461410051e-05, + "loss": 2.9577, + "step": 2458500 + }, + { + "epoch": 0.7644159554344565, + "grad_norm": 9.215609550476074, + "learning_rate": 3.7259734076092396e-05, + "loss": 2.9036, + "step": 2459000 + }, + { + "epoch": 0.7645713877149434, + "grad_norm": 8.180221557617188, + "learning_rate": 3.725714353808428e-05, + "loss": 2.9198, + "step": 2459500 + }, + { + "epoch": 0.7647268199954302, + "grad_norm": 8.469444274902344, + "learning_rate": 3.7254553000076164e-05, + "loss": 2.8986, + "step": 2460000 + }, + { + "epoch": 0.7648822522759172, + "grad_norm": 8.017027854919434, + "learning_rate": 3.7251962462068044e-05, + "loss": 2.9161, + "step": 2460500 + }, + { + "epoch": 0.7650376845564041, + "grad_norm": 26.370656967163086, + "learning_rate": 3.724937192405993e-05, + "loss": 2.8795, + "step": 2461000 + }, + { + "epoch": 0.765193116836891, + "grad_norm": 8.570987701416016, + "learning_rate": 3.724678138605182e-05, + "loss": 2.8975, + "step": 2461500 + }, + { + "epoch": 0.7653485491173778, + "grad_norm": 10.242915153503418, + "learning_rate": 3.7244190848043706e-05, + "loss": 2.8958, + "step": 2462000 + }, + { + "epoch": 0.7655039813978647, + "grad_norm": 25.382089614868164, + "learning_rate": 3.724160031003559e-05, + "loss": 2.8864, + "step": 2462500 + }, + { + "epoch": 0.7656594136783516, + "grad_norm": 7.746616840362549, + "learning_rate": 3.723900977202748e-05, + "loss": 2.9559, + "step": 2463000 + }, + { + "epoch": 0.7658148459588384, + "grad_norm": 9.258628845214844, + "learning_rate": 3.723641923401936e-05, + "loss": 2.8699, + "step": 2463500 + }, + { + "epoch": 0.7659702782393253, + "grad_norm": 10.076728820800781, + "learning_rate": 3.723382869601125e-05, + "loss": 2.9207, + "step": 2464000 + }, + { + "epoch": 0.7661257105198122, + "grad_norm": 8.305550575256348, + "learning_rate": 3.7231238158003135e-05, + "loss": 2.9389, + "step": 2464500 + }, + { + "epoch": 0.766281142800299, + "grad_norm": 7.94599723815918, + "learning_rate": 3.7228647619995015e-05, + "loss": 2.9215, + "step": 2465000 + }, + { + "epoch": 0.7664365750807859, + "grad_norm": 7.9852423667907715, + "learning_rate": 3.72260570819869e-05, + "loss": 2.9027, + "step": 2465500 + }, + { + "epoch": 0.7665920073612728, + "grad_norm": 31.59520721435547, + "learning_rate": 3.722346654397879e-05, + "loss": 2.91, + "step": 2466000 + }, + { + "epoch": 0.7667474396417597, + "grad_norm": 8.290989875793457, + "learning_rate": 3.722087600597067e-05, + "loss": 2.9187, + "step": 2466500 + }, + { + "epoch": 0.7669028719222466, + "grad_norm": 8.745713233947754, + "learning_rate": 3.721828546796256e-05, + "loss": 2.9417, + "step": 2467000 + }, + { + "epoch": 0.7670583042027335, + "grad_norm": 21.179380416870117, + "learning_rate": 3.7215694929954444e-05, + "loss": 2.9145, + "step": 2467500 + }, + { + "epoch": 0.7672137364832203, + "grad_norm": 8.553051948547363, + "learning_rate": 3.721310439194633e-05, + "loss": 2.9045, + "step": 2468000 + }, + { + "epoch": 0.7673691687637072, + "grad_norm": 9.592580795288086, + "learning_rate": 3.721051385393822e-05, + "loss": 2.9391, + "step": 2468500 + }, + { + "epoch": 0.7675246010441941, + "grad_norm": 7.709449768066406, + "learning_rate": 3.72079233159301e-05, + "loss": 2.8993, + "step": 2469000 + }, + { + "epoch": 0.7676800333246809, + "grad_norm": 7.707231521606445, + "learning_rate": 3.7205332777921986e-05, + "loss": 2.9196, + "step": 2469500 + }, + { + "epoch": 0.7678354656051678, + "grad_norm": 23.401636123657227, + "learning_rate": 3.720274223991387e-05, + "loss": 2.8834, + "step": 2470000 + }, + { + "epoch": 0.7679908978856547, + "grad_norm": 11.589532852172852, + "learning_rate": 3.7200151701905754e-05, + "loss": 2.9181, + "step": 2470500 + }, + { + "epoch": 0.7681463301661415, + "grad_norm": 10.008474349975586, + "learning_rate": 3.719756116389764e-05, + "loss": 2.9164, + "step": 2471000 + }, + { + "epoch": 0.7683017624466284, + "grad_norm": 6.503916263580322, + "learning_rate": 3.719497062588953e-05, + "loss": 2.9288, + "step": 2471500 + }, + { + "epoch": 0.7684571947271153, + "grad_norm": 7.692534446716309, + "learning_rate": 3.7192380087881415e-05, + "loss": 2.8497, + "step": 2472000 + }, + { + "epoch": 0.7686126270076022, + "grad_norm": 10.0125093460083, + "learning_rate": 3.71897895498733e-05, + "loss": 2.9354, + "step": 2472500 + }, + { + "epoch": 0.7687680592880891, + "grad_norm": 10.27791976928711, + "learning_rate": 3.718719901186518e-05, + "loss": 2.8991, + "step": 2473000 + }, + { + "epoch": 0.768923491568576, + "grad_norm": 8.98253059387207, + "learning_rate": 3.718460847385707e-05, + "loss": 2.8807, + "step": 2473500 + }, + { + "epoch": 0.7690789238490628, + "grad_norm": 6.8341827392578125, + "learning_rate": 3.718201793584896e-05, + "loss": 2.873, + "step": 2474000 + }, + { + "epoch": 0.7692343561295497, + "grad_norm": 11.017324447631836, + "learning_rate": 3.717942739784084e-05, + "loss": 2.9463, + "step": 2474500 + }, + { + "epoch": 0.7693897884100366, + "grad_norm": 9.29404354095459, + "learning_rate": 3.7176836859832724e-05, + "loss": 2.901, + "step": 2475000 + }, + { + "epoch": 0.7695452206905234, + "grad_norm": 19.767240524291992, + "learning_rate": 3.717424632182461e-05, + "loss": 2.9417, + "step": 2475500 + }, + { + "epoch": 0.7697006529710103, + "grad_norm": 8.709508895874023, + "learning_rate": 3.717165578381649e-05, + "loss": 2.9384, + "step": 2476000 + }, + { + "epoch": 0.7698560852514972, + "grad_norm": 10.012300491333008, + "learning_rate": 3.716906524580838e-05, + "loss": 2.8824, + "step": 2476500 + }, + { + "epoch": 0.770011517531984, + "grad_norm": 8.791191101074219, + "learning_rate": 3.7166474707800266e-05, + "loss": 2.9091, + "step": 2477000 + }, + { + "epoch": 0.7701669498124709, + "grad_norm": 7.970595359802246, + "learning_rate": 3.716388416979215e-05, + "loss": 2.934, + "step": 2477500 + }, + { + "epoch": 0.7703223820929578, + "grad_norm": 11.98901081085205, + "learning_rate": 3.716129363178404e-05, + "loss": 2.9302, + "step": 2478000 + }, + { + "epoch": 0.7704778143734448, + "grad_norm": 11.879839897155762, + "learning_rate": 3.715870309377592e-05, + "loss": 2.8919, + "step": 2478500 + }, + { + "epoch": 0.7706332466539316, + "grad_norm": 41.98666763305664, + "learning_rate": 3.715611255576781e-05, + "loss": 2.9444, + "step": 2479000 + }, + { + "epoch": 0.7707886789344185, + "grad_norm": 35.517433166503906, + "learning_rate": 3.7153522017759695e-05, + "loss": 2.9891, + "step": 2479500 + }, + { + "epoch": 0.7709441112149054, + "grad_norm": 13.746559143066406, + "learning_rate": 3.7150931479751576e-05, + "loss": 2.8615, + "step": 2480000 + }, + { + "epoch": 0.7710995434953922, + "grad_norm": 7.7032694816589355, + "learning_rate": 3.714834094174346e-05, + "loss": 2.9134, + "step": 2480500 + }, + { + "epoch": 0.7712549757758791, + "grad_norm": 17.60002899169922, + "learning_rate": 3.714575040373535e-05, + "loss": 2.9373, + "step": 2481000 + }, + { + "epoch": 0.771410408056366, + "grad_norm": 11.073016166687012, + "learning_rate": 3.714315986572724e-05, + "loss": 2.9301, + "step": 2481500 + }, + { + "epoch": 0.7715658403368528, + "grad_norm": 8.99791145324707, + "learning_rate": 3.7140569327719124e-05, + "loss": 2.9131, + "step": 2482000 + }, + { + "epoch": 0.7717212726173397, + "grad_norm": 8.73534107208252, + "learning_rate": 3.713797878971101e-05, + "loss": 2.9322, + "step": 2482500 + }, + { + "epoch": 0.7718767048978266, + "grad_norm": 10.322243690490723, + "learning_rate": 3.713538825170289e-05, + "loss": 2.9037, + "step": 2483000 + }, + { + "epoch": 0.7720321371783134, + "grad_norm": 8.336400032043457, + "learning_rate": 3.713279771369478e-05, + "loss": 2.9571, + "step": 2483500 + }, + { + "epoch": 0.7721875694588003, + "grad_norm": 12.079141616821289, + "learning_rate": 3.7130207175686666e-05, + "loss": 2.9391, + "step": 2484000 + }, + { + "epoch": 0.7723430017392873, + "grad_norm": 10.82101821899414, + "learning_rate": 3.7127616637678546e-05, + "loss": 2.9015, + "step": 2484500 + }, + { + "epoch": 0.7724984340197741, + "grad_norm": 9.080245971679688, + "learning_rate": 3.7125026099670434e-05, + "loss": 2.9265, + "step": 2485000 + }, + { + "epoch": 0.772653866300261, + "grad_norm": 9.028696060180664, + "learning_rate": 3.7122435561662314e-05, + "loss": 2.8511, + "step": 2485500 + }, + { + "epoch": 0.7728092985807479, + "grad_norm": 8.82813835144043, + "learning_rate": 3.71198450236542e-05, + "loss": 2.9621, + "step": 2486000 + }, + { + "epoch": 0.7729647308612347, + "grad_norm": 10.32519245147705, + "learning_rate": 3.711725448564609e-05, + "loss": 2.9166, + "step": 2486500 + }, + { + "epoch": 0.7731201631417216, + "grad_norm": 16.249053955078125, + "learning_rate": 3.7114663947637975e-05, + "loss": 2.8771, + "step": 2487000 + }, + { + "epoch": 0.7732755954222085, + "grad_norm": 8.312291145324707, + "learning_rate": 3.711207340962986e-05, + "loss": 2.9969, + "step": 2487500 + }, + { + "epoch": 0.7734310277026953, + "grad_norm": 8.553740501403809, + "learning_rate": 3.710948287162175e-05, + "loss": 2.9095, + "step": 2488000 + }, + { + "epoch": 0.7735864599831822, + "grad_norm": 6.004056930541992, + "learning_rate": 3.710689233361363e-05, + "loss": 2.9263, + "step": 2488500 + }, + { + "epoch": 0.7737418922636691, + "grad_norm": 7.511846542358398, + "learning_rate": 3.710430179560552e-05, + "loss": 2.8728, + "step": 2489000 + }, + { + "epoch": 0.7738973245441559, + "grad_norm": 8.17834758758545, + "learning_rate": 3.7101711257597404e-05, + "loss": 2.8741, + "step": 2489500 + }, + { + "epoch": 0.7740527568246428, + "grad_norm": 8.860000610351562, + "learning_rate": 3.7099120719589285e-05, + "loss": 2.8978, + "step": 2490000 + }, + { + "epoch": 0.7742081891051298, + "grad_norm": 9.482637405395508, + "learning_rate": 3.709653018158117e-05, + "loss": 2.9144, + "step": 2490500 + }, + { + "epoch": 0.7743636213856167, + "grad_norm": 9.628345489501953, + "learning_rate": 3.709393964357306e-05, + "loss": 2.9286, + "step": 2491000 + }, + { + "epoch": 0.7745190536661035, + "grad_norm": 8.135675430297852, + "learning_rate": 3.7091349105564946e-05, + "loss": 2.9319, + "step": 2491500 + }, + { + "epoch": 0.7746744859465904, + "grad_norm": 7.881032943725586, + "learning_rate": 3.7088758567556833e-05, + "loss": 2.8962, + "step": 2492000 + }, + { + "epoch": 0.7748299182270773, + "grad_norm": 7.686664581298828, + "learning_rate": 3.7086168029548714e-05, + "loss": 2.9096, + "step": 2492500 + }, + { + "epoch": 0.7749853505075641, + "grad_norm": 5.857980728149414, + "learning_rate": 3.70835774915406e-05, + "loss": 2.9598, + "step": 2493000 + }, + { + "epoch": 0.775140782788051, + "grad_norm": 9.300689697265625, + "learning_rate": 3.708098695353249e-05, + "loss": 2.9045, + "step": 2493500 + }, + { + "epoch": 0.7752962150685379, + "grad_norm": 8.387650489807129, + "learning_rate": 3.707839641552437e-05, + "loss": 2.9214, + "step": 2494000 + }, + { + "epoch": 0.7754516473490247, + "grad_norm": 19.761930465698242, + "learning_rate": 3.7075805877516256e-05, + "loss": 2.8528, + "step": 2494500 + }, + { + "epoch": 0.7756070796295116, + "grad_norm": 9.024097442626953, + "learning_rate": 3.707321533950814e-05, + "loss": 2.9405, + "step": 2495000 + }, + { + "epoch": 0.7757625119099985, + "grad_norm": 8.843426704406738, + "learning_rate": 3.707062480150002e-05, + "loss": 2.9681, + "step": 2495500 + }, + { + "epoch": 0.7759179441904853, + "grad_norm": 8.150529861450195, + "learning_rate": 3.706803426349191e-05, + "loss": 2.8996, + "step": 2496000 + }, + { + "epoch": 0.7760733764709723, + "grad_norm": 11.671916961669922, + "learning_rate": 3.70654437254838e-05, + "loss": 2.9031, + "step": 2496500 + }, + { + "epoch": 0.7762288087514592, + "grad_norm": 7.551968097686768, + "learning_rate": 3.7062853187475685e-05, + "loss": 2.9108, + "step": 2497000 + }, + { + "epoch": 0.776384241031946, + "grad_norm": 8.742918014526367, + "learning_rate": 3.706026264946757e-05, + "loss": 2.8957, + "step": 2497500 + }, + { + "epoch": 0.7765396733124329, + "grad_norm": 7.685513496398926, + "learning_rate": 3.705767211145945e-05, + "loss": 2.9293, + "step": 2498000 + }, + { + "epoch": 0.7766951055929198, + "grad_norm": 8.793463706970215, + "learning_rate": 3.705508157345134e-05, + "loss": 2.9152, + "step": 2498500 + }, + { + "epoch": 0.7768505378734066, + "grad_norm": 10.224218368530273, + "learning_rate": 3.7052491035443226e-05, + "loss": 2.917, + "step": 2499000 + }, + { + "epoch": 0.7770059701538935, + "grad_norm": 10.376906394958496, + "learning_rate": 3.704990049743511e-05, + "loss": 2.8716, + "step": 2499500 + }, + { + "epoch": 0.7771614024343804, + "grad_norm": 14.542830467224121, + "learning_rate": 3.7047309959426994e-05, + "loss": 2.8962, + "step": 2500000 + }, + { + "epoch": 0.7773168347148672, + "grad_norm": 8.91773796081543, + "learning_rate": 3.704471942141888e-05, + "loss": 2.9991, + "step": 2500500 + }, + { + "epoch": 0.7774722669953541, + "grad_norm": 7.7382493019104, + "learning_rate": 3.704212888341077e-05, + "loss": 2.9335, + "step": 2501000 + }, + { + "epoch": 0.777627699275841, + "grad_norm": 8.134830474853516, + "learning_rate": 3.7039538345402655e-05, + "loss": 2.9633, + "step": 2501500 + }, + { + "epoch": 0.7777831315563278, + "grad_norm": 7.945289134979248, + "learning_rate": 3.703694780739454e-05, + "loss": 2.9501, + "step": 2502000 + }, + { + "epoch": 0.7779385638368148, + "grad_norm": 7.569042205810547, + "learning_rate": 3.703435726938642e-05, + "loss": 2.9026, + "step": 2502500 + }, + { + "epoch": 0.7780939961173017, + "grad_norm": 7.373986721038818, + "learning_rate": 3.703176673137831e-05, + "loss": 2.9002, + "step": 2503000 + }, + { + "epoch": 0.7782494283977885, + "grad_norm": 8.389754295349121, + "learning_rate": 3.702917619337019e-05, + "loss": 2.9377, + "step": 2503500 + }, + { + "epoch": 0.7784048606782754, + "grad_norm": 6.91206169128418, + "learning_rate": 3.702658565536208e-05, + "loss": 2.8956, + "step": 2504000 + }, + { + "epoch": 0.7785602929587623, + "grad_norm": 9.233710289001465, + "learning_rate": 3.7023995117353965e-05, + "loss": 2.9136, + "step": 2504500 + }, + { + "epoch": 0.7787157252392491, + "grad_norm": 8.977815628051758, + "learning_rate": 3.7021404579345845e-05, + "loss": 2.9092, + "step": 2505000 + }, + { + "epoch": 0.778871157519736, + "grad_norm": 21.69558334350586, + "learning_rate": 3.701881404133773e-05, + "loss": 2.9356, + "step": 2505500 + }, + { + "epoch": 0.7790265898002229, + "grad_norm": 7.4995646476745605, + "learning_rate": 3.701622350332962e-05, + "loss": 2.8804, + "step": 2506000 + }, + { + "epoch": 0.7791820220807097, + "grad_norm": 8.78020191192627, + "learning_rate": 3.701363296532151e-05, + "loss": 2.9148, + "step": 2506500 + }, + { + "epoch": 0.7793374543611966, + "grad_norm": 8.51196575164795, + "learning_rate": 3.7011042427313394e-05, + "loss": 2.9294, + "step": 2507000 + }, + { + "epoch": 0.7794928866416835, + "grad_norm": 10.999595642089844, + "learning_rate": 3.700845188930528e-05, + "loss": 2.9682, + "step": 2507500 + }, + { + "epoch": 0.7796483189221703, + "grad_norm": 26.98992156982422, + "learning_rate": 3.700586135129716e-05, + "loss": 2.9546, + "step": 2508000 + }, + { + "epoch": 0.7798037512026573, + "grad_norm": 8.548038482666016, + "learning_rate": 3.700327081328905e-05, + "loss": 2.9432, + "step": 2508500 + }, + { + "epoch": 0.7799591834831442, + "grad_norm": 7.825409412384033, + "learning_rate": 3.700068027528093e-05, + "loss": 2.9047, + "step": 2509000 + }, + { + "epoch": 0.7801146157636311, + "grad_norm": 9.573442459106445, + "learning_rate": 3.6998089737272816e-05, + "loss": 2.9316, + "step": 2509500 + }, + { + "epoch": 0.7802700480441179, + "grad_norm": 9.330910682678223, + "learning_rate": 3.69954991992647e-05, + "loss": 2.9215, + "step": 2510000 + }, + { + "epoch": 0.7804254803246048, + "grad_norm": 18.893552780151367, + "learning_rate": 3.6992908661256584e-05, + "loss": 2.8496, + "step": 2510500 + }, + { + "epoch": 0.7805809126050917, + "grad_norm": 8.358064651489258, + "learning_rate": 3.699031812324848e-05, + "loss": 2.9524, + "step": 2511000 + }, + { + "epoch": 0.7807363448855785, + "grad_norm": 8.166193962097168, + "learning_rate": 3.6987727585240365e-05, + "loss": 2.908, + "step": 2511500 + }, + { + "epoch": 0.7808917771660654, + "grad_norm": 9.050308227539062, + "learning_rate": 3.6985137047232245e-05, + "loss": 2.9653, + "step": 2512000 + }, + { + "epoch": 0.7810472094465523, + "grad_norm": 7.364532470703125, + "learning_rate": 3.698254650922413e-05, + "loss": 2.8857, + "step": 2512500 + }, + { + "epoch": 0.7812026417270391, + "grad_norm": 8.818592071533203, + "learning_rate": 3.697995597121602e-05, + "loss": 2.9321, + "step": 2513000 + }, + { + "epoch": 0.781358074007526, + "grad_norm": 16.592533111572266, + "learning_rate": 3.69773654332079e-05, + "loss": 2.8982, + "step": 2513500 + }, + { + "epoch": 0.7815135062880129, + "grad_norm": 7.388978004455566, + "learning_rate": 3.697477489519979e-05, + "loss": 2.9062, + "step": 2514000 + }, + { + "epoch": 0.7816689385684998, + "grad_norm": 11.365740776062012, + "learning_rate": 3.697218435719167e-05, + "loss": 2.9207, + "step": 2514500 + }, + { + "epoch": 0.7818243708489867, + "grad_norm": 8.310303688049316, + "learning_rate": 3.6969593819183554e-05, + "loss": 2.9515, + "step": 2515000 + }, + { + "epoch": 0.7819798031294736, + "grad_norm": 13.657231330871582, + "learning_rate": 3.696700328117544e-05, + "loss": 2.9431, + "step": 2515500 + }, + { + "epoch": 0.7821352354099604, + "grad_norm": 10.48801326751709, + "learning_rate": 3.696441274316733e-05, + "loss": 2.9101, + "step": 2516000 + }, + { + "epoch": 0.7822906676904473, + "grad_norm": 7.607305526733398, + "learning_rate": 3.6961822205159216e-05, + "loss": 2.8959, + "step": 2516500 + }, + { + "epoch": 0.7824460999709342, + "grad_norm": 9.099597930908203, + "learning_rate": 3.69592316671511e-05, + "loss": 2.8967, + "step": 2517000 + }, + { + "epoch": 0.782601532251421, + "grad_norm": 9.616026878356934, + "learning_rate": 3.695664112914298e-05, + "loss": 2.9041, + "step": 2517500 + }, + { + "epoch": 0.7827569645319079, + "grad_norm": 7.188083171844482, + "learning_rate": 3.695405059113487e-05, + "loss": 2.911, + "step": 2518000 + }, + { + "epoch": 0.7829123968123948, + "grad_norm": 9.986123085021973, + "learning_rate": 3.695146005312676e-05, + "loss": 2.8841, + "step": 2518500 + }, + { + "epoch": 0.7830678290928816, + "grad_norm": 6.815340042114258, + "learning_rate": 3.694886951511864e-05, + "loss": 2.8794, + "step": 2519000 + }, + { + "epoch": 0.7832232613733685, + "grad_norm": 9.172028541564941, + "learning_rate": 3.6946278977110525e-05, + "loss": 2.9249, + "step": 2519500 + }, + { + "epoch": 0.7833786936538554, + "grad_norm": 10.194438934326172, + "learning_rate": 3.694368843910241e-05, + "loss": 2.9497, + "step": 2520000 + }, + { + "epoch": 0.7835341259343424, + "grad_norm": 9.410412788391113, + "learning_rate": 3.694109790109429e-05, + "loss": 2.9124, + "step": 2520500 + }, + { + "epoch": 0.7836895582148292, + "grad_norm": 8.354296684265137, + "learning_rate": 3.693850736308619e-05, + "loss": 2.9256, + "step": 2521000 + }, + { + "epoch": 0.7838449904953161, + "grad_norm": 8.988810539245605, + "learning_rate": 3.693591682507807e-05, + "loss": 2.8757, + "step": 2521500 + }, + { + "epoch": 0.784000422775803, + "grad_norm": 8.780189514160156, + "learning_rate": 3.6933326287069954e-05, + "loss": 2.8914, + "step": 2522000 + }, + { + "epoch": 0.7841558550562898, + "grad_norm": 12.196769714355469, + "learning_rate": 3.693073574906184e-05, + "loss": 2.9483, + "step": 2522500 + }, + { + "epoch": 0.7843112873367767, + "grad_norm": 10.716617584228516, + "learning_rate": 3.692814521105372e-05, + "loss": 2.8968, + "step": 2523000 + }, + { + "epoch": 0.7844667196172636, + "grad_norm": 10.095123291015625, + "learning_rate": 3.692555467304561e-05, + "loss": 2.9268, + "step": 2523500 + }, + { + "epoch": 0.7846221518977504, + "grad_norm": 9.141950607299805, + "learning_rate": 3.6922964135037496e-05, + "loss": 2.9321, + "step": 2524000 + }, + { + "epoch": 0.7847775841782373, + "grad_norm": 9.645607948303223, + "learning_rate": 3.6920373597029376e-05, + "loss": 2.8865, + "step": 2524500 + }, + { + "epoch": 0.7849330164587242, + "grad_norm": 7.908655643463135, + "learning_rate": 3.6917783059021264e-05, + "loss": 2.9011, + "step": 2525000 + }, + { + "epoch": 0.785088448739211, + "grad_norm": 8.97266674041748, + "learning_rate": 3.691519252101315e-05, + "loss": 2.9242, + "step": 2525500 + }, + { + "epoch": 0.7852438810196979, + "grad_norm": 8.00079345703125, + "learning_rate": 3.691260198300504e-05, + "loss": 2.9137, + "step": 2526000 + }, + { + "epoch": 0.7853993133001848, + "grad_norm": 9.866944313049316, + "learning_rate": 3.6910011444996925e-05, + "loss": 2.9269, + "step": 2526500 + }, + { + "epoch": 0.7855547455806717, + "grad_norm": 10.905323028564453, + "learning_rate": 3.6907420906988805e-05, + "loss": 2.91, + "step": 2527000 + }, + { + "epoch": 0.7857101778611586, + "grad_norm": 7.760676383972168, + "learning_rate": 3.690483036898069e-05, + "loss": 2.9478, + "step": 2527500 + }, + { + "epoch": 0.7858656101416455, + "grad_norm": 10.450756072998047, + "learning_rate": 3.690223983097258e-05, + "loss": 2.9444, + "step": 2528000 + }, + { + "epoch": 0.7860210424221323, + "grad_norm": 9.818081855773926, + "learning_rate": 3.689964929296446e-05, + "loss": 2.9222, + "step": 2528500 + }, + { + "epoch": 0.7861764747026192, + "grad_norm": 8.814321517944336, + "learning_rate": 3.689705875495635e-05, + "loss": 2.9068, + "step": 2529000 + }, + { + "epoch": 0.7863319069831061, + "grad_norm": 9.116321563720703, + "learning_rate": 3.6894468216948234e-05, + "loss": 2.9045, + "step": 2529500 + }, + { + "epoch": 0.7864873392635929, + "grad_norm": 9.717029571533203, + "learning_rate": 3.6891877678940115e-05, + "loss": 2.8863, + "step": 2530000 + }, + { + "epoch": 0.7866427715440798, + "grad_norm": 7.202411651611328, + "learning_rate": 3.6889287140932e-05, + "loss": 2.8956, + "step": 2530500 + }, + { + "epoch": 0.7867982038245667, + "grad_norm": 9.784390449523926, + "learning_rate": 3.6886696602923896e-05, + "loss": 2.9326, + "step": 2531000 + }, + { + "epoch": 0.7869536361050535, + "grad_norm": 9.139976501464844, + "learning_rate": 3.6884106064915776e-05, + "loss": 2.8994, + "step": 2531500 + }, + { + "epoch": 0.7871090683855404, + "grad_norm": 7.318473815917969, + "learning_rate": 3.688151552690766e-05, + "loss": 2.8723, + "step": 2532000 + }, + { + "epoch": 0.7872645006660273, + "grad_norm": 6.847056865692139, + "learning_rate": 3.687892498889955e-05, + "loss": 2.9103, + "step": 2532500 + }, + { + "epoch": 0.7874199329465142, + "grad_norm": 10.8363676071167, + "learning_rate": 3.687633445089143e-05, + "loss": 2.8981, + "step": 2533000 + }, + { + "epoch": 0.7875753652270011, + "grad_norm": 10.528130531311035, + "learning_rate": 3.687374391288332e-05, + "loss": 2.9259, + "step": 2533500 + }, + { + "epoch": 0.787730797507488, + "grad_norm": 7.13125467300415, + "learning_rate": 3.68711533748752e-05, + "loss": 2.9081, + "step": 2534000 + }, + { + "epoch": 0.7878862297879748, + "grad_norm": 7.763057708740234, + "learning_rate": 3.6868562836867086e-05, + "loss": 2.887, + "step": 2534500 + }, + { + "epoch": 0.7880416620684617, + "grad_norm": 15.552785873413086, + "learning_rate": 3.686597229885897e-05, + "loss": 2.8872, + "step": 2535000 + }, + { + "epoch": 0.7881970943489486, + "grad_norm": 7.560677528381348, + "learning_rate": 3.686338176085086e-05, + "loss": 2.8742, + "step": 2535500 + }, + { + "epoch": 0.7883525266294354, + "grad_norm": 13.613449096679688, + "learning_rate": 3.686079122284275e-05, + "loss": 2.9501, + "step": 2536000 + }, + { + "epoch": 0.7885079589099223, + "grad_norm": 10.179408073425293, + "learning_rate": 3.6858200684834634e-05, + "loss": 2.9123, + "step": 2536500 + }, + { + "epoch": 0.7886633911904092, + "grad_norm": 7.246852397918701, + "learning_rate": 3.6855610146826515e-05, + "loss": 2.9188, + "step": 2537000 + }, + { + "epoch": 0.788818823470896, + "grad_norm": 10.184661865234375, + "learning_rate": 3.68530196088184e-05, + "loss": 2.9403, + "step": 2537500 + }, + { + "epoch": 0.7889742557513829, + "grad_norm": 8.86436939239502, + "learning_rate": 3.685042907081029e-05, + "loss": 2.8827, + "step": 2538000 + }, + { + "epoch": 0.7891296880318698, + "grad_norm": 10.803474426269531, + "learning_rate": 3.684783853280217e-05, + "loss": 2.9118, + "step": 2538500 + }, + { + "epoch": 0.7892851203123568, + "grad_norm": 13.65384292602539, + "learning_rate": 3.6845247994794056e-05, + "loss": 2.9194, + "step": 2539000 + }, + { + "epoch": 0.7894405525928436, + "grad_norm": 9.123188972473145, + "learning_rate": 3.684265745678594e-05, + "loss": 2.903, + "step": 2539500 + }, + { + "epoch": 0.7895959848733305, + "grad_norm": 8.52552604675293, + "learning_rate": 3.6840066918777824e-05, + "loss": 2.9198, + "step": 2540000 + }, + { + "epoch": 0.7897514171538174, + "grad_norm": 8.815842628479004, + "learning_rate": 3.683747638076971e-05, + "loss": 2.9165, + "step": 2540500 + }, + { + "epoch": 0.7899068494343042, + "grad_norm": 6.83341121673584, + "learning_rate": 3.68348858427616e-05, + "loss": 2.9291, + "step": 2541000 + }, + { + "epoch": 0.7900622817147911, + "grad_norm": 9.2157564163208, + "learning_rate": 3.6832295304753485e-05, + "loss": 2.8826, + "step": 2541500 + }, + { + "epoch": 0.790217713995278, + "grad_norm": 11.972740173339844, + "learning_rate": 3.682970476674537e-05, + "loss": 2.9608, + "step": 2542000 + }, + { + "epoch": 0.7903731462757648, + "grad_norm": 7.078418731689453, + "learning_rate": 3.682711422873725e-05, + "loss": 2.9289, + "step": 2542500 + }, + { + "epoch": 0.7905285785562517, + "grad_norm": 8.081480979919434, + "learning_rate": 3.682452369072914e-05, + "loss": 2.9, + "step": 2543000 + }, + { + "epoch": 0.7906840108367386, + "grad_norm": 5.968482971191406, + "learning_rate": 3.682193315272103e-05, + "loss": 2.9155, + "step": 2543500 + }, + { + "epoch": 0.7908394431172254, + "grad_norm": 9.320084571838379, + "learning_rate": 3.681934261471291e-05, + "loss": 2.947, + "step": 2544000 + }, + { + "epoch": 0.7909948753977123, + "grad_norm": 9.23572826385498, + "learning_rate": 3.6816752076704795e-05, + "loss": 2.8981, + "step": 2544500 + }, + { + "epoch": 0.7911503076781993, + "grad_norm": 10.990642547607422, + "learning_rate": 3.681416153869668e-05, + "loss": 2.9233, + "step": 2545000 + }, + { + "epoch": 0.7913057399586861, + "grad_norm": 14.437932014465332, + "learning_rate": 3.681157100068857e-05, + "loss": 2.9198, + "step": 2545500 + }, + { + "epoch": 0.791461172239173, + "grad_norm": 11.242708206176758, + "learning_rate": 3.6808980462680456e-05, + "loss": 2.8898, + "step": 2546000 + }, + { + "epoch": 0.7916166045196599, + "grad_norm": 9.563525199890137, + "learning_rate": 3.6806389924672337e-05, + "loss": 2.9106, + "step": 2546500 + }, + { + "epoch": 0.7917720368001467, + "grad_norm": 22.428728103637695, + "learning_rate": 3.6803799386664224e-05, + "loss": 2.9096, + "step": 2547000 + }, + { + "epoch": 0.7919274690806336, + "grad_norm": 8.14686107635498, + "learning_rate": 3.680120884865611e-05, + "loss": 2.9006, + "step": 2547500 + }, + { + "epoch": 0.7920829013611205, + "grad_norm": 7.210602283477783, + "learning_rate": 3.679861831064799e-05, + "loss": 2.8913, + "step": 2548000 + }, + { + "epoch": 0.7922383336416073, + "grad_norm": 9.664528846740723, + "learning_rate": 3.679602777263988e-05, + "loss": 2.8701, + "step": 2548500 + }, + { + "epoch": 0.7923937659220942, + "grad_norm": 10.036486625671387, + "learning_rate": 3.6793437234631766e-05, + "loss": 2.898, + "step": 2549000 + }, + { + "epoch": 0.7925491982025811, + "grad_norm": 8.484517097473145, + "learning_rate": 3.6790846696623646e-05, + "loss": 2.9153, + "step": 2549500 + }, + { + "epoch": 0.7927046304830679, + "grad_norm": 7.479803085327148, + "learning_rate": 3.678825615861553e-05, + "loss": 2.8774, + "step": 2550000 + }, + { + "epoch": 0.7928600627635548, + "grad_norm": 10.352155685424805, + "learning_rate": 3.678566562060742e-05, + "loss": 2.9529, + "step": 2550500 + }, + { + "epoch": 0.7930154950440418, + "grad_norm": 5.736881256103516, + "learning_rate": 3.678307508259931e-05, + "loss": 2.9342, + "step": 2551000 + }, + { + "epoch": 0.7931709273245287, + "grad_norm": 9.618227005004883, + "learning_rate": 3.6780484544591195e-05, + "loss": 2.9491, + "step": 2551500 + }, + { + "epoch": 0.7933263596050155, + "grad_norm": 7.638864040374756, + "learning_rate": 3.6777894006583075e-05, + "loss": 2.8989, + "step": 2552000 + }, + { + "epoch": 0.7934817918855024, + "grad_norm": 13.02595043182373, + "learning_rate": 3.677530346857496e-05, + "loss": 2.9201, + "step": 2552500 + }, + { + "epoch": 0.7936372241659893, + "grad_norm": 8.200037956237793, + "learning_rate": 3.677271293056685e-05, + "loss": 2.9102, + "step": 2553000 + }, + { + "epoch": 0.7937926564464761, + "grad_norm": 7.758111476898193, + "learning_rate": 3.677012239255873e-05, + "loss": 2.9553, + "step": 2553500 + }, + { + "epoch": 0.793948088726963, + "grad_norm": 11.31603717803955, + "learning_rate": 3.676753185455062e-05, + "loss": 2.9488, + "step": 2554000 + }, + { + "epoch": 0.7941035210074499, + "grad_norm": 9.419448852539062, + "learning_rate": 3.6764941316542504e-05, + "loss": 2.9255, + "step": 2554500 + }, + { + "epoch": 0.7942589532879367, + "grad_norm": 8.6755952835083, + "learning_rate": 3.676235077853439e-05, + "loss": 2.9212, + "step": 2555000 + }, + { + "epoch": 0.7944143855684236, + "grad_norm": 8.911324501037598, + "learning_rate": 3.675976024052628e-05, + "loss": 2.9061, + "step": 2555500 + }, + { + "epoch": 0.7945698178489105, + "grad_norm": 6.963047504425049, + "learning_rate": 3.6757169702518165e-05, + "loss": 2.8908, + "step": 2556000 + }, + { + "epoch": 0.7947252501293973, + "grad_norm": 9.711777687072754, + "learning_rate": 3.6754579164510046e-05, + "loss": 2.9053, + "step": 2556500 + }, + { + "epoch": 0.7948806824098843, + "grad_norm": 7.738769054412842, + "learning_rate": 3.675198862650193e-05, + "loss": 2.9385, + "step": 2557000 + }, + { + "epoch": 0.7950361146903712, + "grad_norm": 8.053642272949219, + "learning_rate": 3.674939808849381e-05, + "loss": 2.8951, + "step": 2557500 + }, + { + "epoch": 0.795191546970858, + "grad_norm": 9.413579940795898, + "learning_rate": 3.67468075504857e-05, + "loss": 2.9134, + "step": 2558000 + }, + { + "epoch": 0.7953469792513449, + "grad_norm": 9.20220947265625, + "learning_rate": 3.674421701247759e-05, + "loss": 2.9247, + "step": 2558500 + }, + { + "epoch": 0.7955024115318318, + "grad_norm": 9.610201835632324, + "learning_rate": 3.674162647446947e-05, + "loss": 2.8745, + "step": 2559000 + }, + { + "epoch": 0.7956578438123186, + "grad_norm": 8.637787818908691, + "learning_rate": 3.6739035936461355e-05, + "loss": 2.8939, + "step": 2559500 + }, + { + "epoch": 0.7958132760928055, + "grad_norm": 13.792863845825195, + "learning_rate": 3.673644539845324e-05, + "loss": 2.8933, + "step": 2560000 + }, + { + "epoch": 0.7959687083732924, + "grad_norm": 10.423511505126953, + "learning_rate": 3.673385486044513e-05, + "loss": 2.9046, + "step": 2560500 + }, + { + "epoch": 0.7961241406537792, + "grad_norm": 29.868038177490234, + "learning_rate": 3.6731264322437017e-05, + "loss": 2.9234, + "step": 2561000 + }, + { + "epoch": 0.7962795729342661, + "grad_norm": 9.692220687866211, + "learning_rate": 3.6728673784428904e-05, + "loss": 2.9212, + "step": 2561500 + }, + { + "epoch": 0.796435005214753, + "grad_norm": 8.043336868286133, + "learning_rate": 3.6726083246420784e-05, + "loss": 2.9422, + "step": 2562000 + }, + { + "epoch": 0.7965904374952398, + "grad_norm": 13.009507179260254, + "learning_rate": 3.672349270841267e-05, + "loss": 2.9133, + "step": 2562500 + }, + { + "epoch": 0.7967458697757268, + "grad_norm": 7.511747360229492, + "learning_rate": 3.672090217040455e-05, + "loss": 2.8915, + "step": 2563000 + }, + { + "epoch": 0.7969013020562137, + "grad_norm": 8.673309326171875, + "learning_rate": 3.671831163239644e-05, + "loss": 2.9548, + "step": 2563500 + }, + { + "epoch": 0.7970567343367005, + "grad_norm": 9.424271583557129, + "learning_rate": 3.6715721094388326e-05, + "loss": 2.8908, + "step": 2564000 + }, + { + "epoch": 0.7972121666171874, + "grad_norm": 7.649267196655273, + "learning_rate": 3.6713130556380206e-05, + "loss": 2.9302, + "step": 2564500 + }, + { + "epoch": 0.7973675988976743, + "grad_norm": 10.11984634399414, + "learning_rate": 3.67105400183721e-05, + "loss": 2.9338, + "step": 2565000 + }, + { + "epoch": 0.7975230311781611, + "grad_norm": 8.054205894470215, + "learning_rate": 3.670794948036399e-05, + "loss": 2.9315, + "step": 2565500 + }, + { + "epoch": 0.797678463458648, + "grad_norm": 7.765100479125977, + "learning_rate": 3.670535894235587e-05, + "loss": 2.9001, + "step": 2566000 + }, + { + "epoch": 0.7978338957391349, + "grad_norm": 7.818370342254639, + "learning_rate": 3.6702768404347755e-05, + "loss": 2.9339, + "step": 2566500 + }, + { + "epoch": 0.7979893280196217, + "grad_norm": 7.396254539489746, + "learning_rate": 3.670017786633964e-05, + "loss": 2.9188, + "step": 2567000 + }, + { + "epoch": 0.7981447603001086, + "grad_norm": 9.735835075378418, + "learning_rate": 3.669758732833152e-05, + "loss": 2.892, + "step": 2567500 + }, + { + "epoch": 0.7983001925805955, + "grad_norm": 10.596372604370117, + "learning_rate": 3.669499679032341e-05, + "loss": 2.9486, + "step": 2568000 + }, + { + "epoch": 0.7984556248610823, + "grad_norm": 9.507933616638184, + "learning_rate": 3.66924062523153e-05, + "loss": 2.873, + "step": 2568500 + }, + { + "epoch": 0.7986110571415693, + "grad_norm": 8.99848461151123, + "learning_rate": 3.668981571430718e-05, + "loss": 2.9389, + "step": 2569000 + }, + { + "epoch": 0.7987664894220562, + "grad_norm": 12.806042671203613, + "learning_rate": 3.6687225176299064e-05, + "loss": 2.9057, + "step": 2569500 + }, + { + "epoch": 0.7989219217025431, + "grad_norm": 7.114687919616699, + "learning_rate": 3.668463463829095e-05, + "loss": 2.9512, + "step": 2570000 + }, + { + "epoch": 0.7990773539830299, + "grad_norm": 8.55597972869873, + "learning_rate": 3.668204410028284e-05, + "loss": 2.9641, + "step": 2570500 + }, + { + "epoch": 0.7992327862635168, + "grad_norm": 9.736856460571289, + "learning_rate": 3.6679453562274726e-05, + "loss": 2.9165, + "step": 2571000 + }, + { + "epoch": 0.7993882185440037, + "grad_norm": 10.874151229858398, + "learning_rate": 3.6676863024266606e-05, + "loss": 2.9225, + "step": 2571500 + }, + { + "epoch": 0.7995436508244905, + "grad_norm": 7.028905391693115, + "learning_rate": 3.667427248625849e-05, + "loss": 2.9433, + "step": 2572000 + }, + { + "epoch": 0.7996990831049774, + "grad_norm": 9.293508529663086, + "learning_rate": 3.667168194825038e-05, + "loss": 2.9043, + "step": 2572500 + }, + { + "epoch": 0.7998545153854643, + "grad_norm": 14.924574851989746, + "learning_rate": 3.666909141024226e-05, + "loss": 2.9588, + "step": 2573000 + }, + { + "epoch": 0.8000099476659511, + "grad_norm": 7.6522321701049805, + "learning_rate": 3.666650087223415e-05, + "loss": 2.8735, + "step": 2573500 + }, + { + "epoch": 0.800165379946438, + "grad_norm": 13.246827125549316, + "learning_rate": 3.6663910334226035e-05, + "loss": 2.8887, + "step": 2574000 + }, + { + "epoch": 0.8003208122269249, + "grad_norm": 6.370206356048584, + "learning_rate": 3.6661319796217916e-05, + "loss": 2.9237, + "step": 2574500 + }, + { + "epoch": 0.8004762445074118, + "grad_norm": 25.58489227294922, + "learning_rate": 3.665872925820981e-05, + "loss": 2.9149, + "step": 2575000 + }, + { + "epoch": 0.8006316767878987, + "grad_norm": 19.879459381103516, + "learning_rate": 3.665613872020169e-05, + "loss": 2.9232, + "step": 2575500 + }, + { + "epoch": 0.8007871090683856, + "grad_norm": 8.264145851135254, + "learning_rate": 3.665354818219358e-05, + "loss": 2.8774, + "step": 2576000 + }, + { + "epoch": 0.8009425413488724, + "grad_norm": 8.93844985961914, + "learning_rate": 3.6650957644185464e-05, + "loss": 2.9226, + "step": 2576500 + }, + { + "epoch": 0.8010979736293593, + "grad_norm": 10.398225784301758, + "learning_rate": 3.6648367106177345e-05, + "loss": 2.8884, + "step": 2577000 + }, + { + "epoch": 0.8012534059098462, + "grad_norm": 27.602062225341797, + "learning_rate": 3.664577656816923e-05, + "loss": 2.9416, + "step": 2577500 + }, + { + "epoch": 0.801408838190333, + "grad_norm": 7.428812026977539, + "learning_rate": 3.664318603016112e-05, + "loss": 2.9564, + "step": 2578000 + }, + { + "epoch": 0.8015642704708199, + "grad_norm": 7.6780219078063965, + "learning_rate": 3.6640595492153e-05, + "loss": 2.9339, + "step": 2578500 + }, + { + "epoch": 0.8017197027513068, + "grad_norm": 12.67911434173584, + "learning_rate": 3.6638004954144886e-05, + "loss": 2.8975, + "step": 2579000 + }, + { + "epoch": 0.8018751350317936, + "grad_norm": 8.543790817260742, + "learning_rate": 3.6635414416136774e-05, + "loss": 2.896, + "step": 2579500 + }, + { + "epoch": 0.8020305673122805, + "grad_norm": 10.3424711227417, + "learning_rate": 3.663282387812866e-05, + "loss": 2.8503, + "step": 2580000 + }, + { + "epoch": 0.8021859995927674, + "grad_norm": 23.33956527709961, + "learning_rate": 3.663023334012055e-05, + "loss": 2.851, + "step": 2580500 + }, + { + "epoch": 0.8023414318732544, + "grad_norm": 7.901225566864014, + "learning_rate": 3.662764280211243e-05, + "loss": 2.917, + "step": 2581000 + }, + { + "epoch": 0.8024968641537412, + "grad_norm": 17.332454681396484, + "learning_rate": 3.6625052264104315e-05, + "loss": 2.9476, + "step": 2581500 + }, + { + "epoch": 0.8026522964342281, + "grad_norm": 8.708745002746582, + "learning_rate": 3.66224617260962e-05, + "loss": 2.931, + "step": 2582000 + }, + { + "epoch": 0.802807728714715, + "grad_norm": 8.236913681030273, + "learning_rate": 3.661987118808808e-05, + "loss": 2.9185, + "step": 2582500 + }, + { + "epoch": 0.8029631609952018, + "grad_norm": 6.92108678817749, + "learning_rate": 3.661728065007997e-05, + "loss": 2.9194, + "step": 2583000 + }, + { + "epoch": 0.8031185932756887, + "grad_norm": 12.73745346069336, + "learning_rate": 3.661469011207186e-05, + "loss": 2.9307, + "step": 2583500 + }, + { + "epoch": 0.8032740255561756, + "grad_norm": 10.385316848754883, + "learning_rate": 3.661209957406374e-05, + "loss": 2.9065, + "step": 2584000 + }, + { + "epoch": 0.8034294578366624, + "grad_norm": 9.173115730285645, + "learning_rate": 3.6609509036055625e-05, + "loss": 2.9135, + "step": 2584500 + }, + { + "epoch": 0.8035848901171493, + "grad_norm": 8.132427215576172, + "learning_rate": 3.660691849804752e-05, + "loss": 2.8964, + "step": 2585000 + }, + { + "epoch": 0.8037403223976362, + "grad_norm": 10.376495361328125, + "learning_rate": 3.66043279600394e-05, + "loss": 2.9347, + "step": 2585500 + }, + { + "epoch": 0.803895754678123, + "grad_norm": 9.916460990905762, + "learning_rate": 3.6601737422031286e-05, + "loss": 2.9012, + "step": 2586000 + }, + { + "epoch": 0.8040511869586099, + "grad_norm": 9.030915260314941, + "learning_rate": 3.659914688402317e-05, + "loss": 2.9391, + "step": 2586500 + }, + { + "epoch": 0.8042066192390969, + "grad_norm": 9.132777214050293, + "learning_rate": 3.6596556346015054e-05, + "loss": 2.9287, + "step": 2587000 + }, + { + "epoch": 0.8043620515195837, + "grad_norm": 9.485337257385254, + "learning_rate": 3.659396580800694e-05, + "loss": 2.9034, + "step": 2587500 + }, + { + "epoch": 0.8045174838000706, + "grad_norm": 10.060250282287598, + "learning_rate": 3.659137526999882e-05, + "loss": 2.8943, + "step": 2588000 + }, + { + "epoch": 0.8046729160805575, + "grad_norm": 20.903152465820312, + "learning_rate": 3.658878473199071e-05, + "loss": 2.9972, + "step": 2588500 + }, + { + "epoch": 0.8048283483610443, + "grad_norm": 10.96153450012207, + "learning_rate": 3.6586194193982596e-05, + "loss": 2.8963, + "step": 2589000 + }, + { + "epoch": 0.8049837806415312, + "grad_norm": 10.287260055541992, + "learning_rate": 3.658360365597448e-05, + "loss": 2.9404, + "step": 2589500 + }, + { + "epoch": 0.8051392129220181, + "grad_norm": 12.275816917419434, + "learning_rate": 3.658101311796637e-05, + "loss": 2.9118, + "step": 2590000 + }, + { + "epoch": 0.8052946452025049, + "grad_norm": 8.738741874694824, + "learning_rate": 3.657842257995826e-05, + "loss": 2.9363, + "step": 2590500 + }, + { + "epoch": 0.8054500774829918, + "grad_norm": 8.944178581237793, + "learning_rate": 3.657583204195014e-05, + "loss": 2.8893, + "step": 2591000 + }, + { + "epoch": 0.8056055097634787, + "grad_norm": 8.823131561279297, + "learning_rate": 3.6573241503942025e-05, + "loss": 2.8926, + "step": 2591500 + }, + { + "epoch": 0.8057609420439655, + "grad_norm": 11.21445083618164, + "learning_rate": 3.657065096593391e-05, + "loss": 2.9018, + "step": 2592000 + }, + { + "epoch": 0.8059163743244524, + "grad_norm": 16.686748504638672, + "learning_rate": 3.656806042792579e-05, + "loss": 2.9298, + "step": 2592500 + }, + { + "epoch": 0.8060718066049394, + "grad_norm": 8.630138397216797, + "learning_rate": 3.656546988991768e-05, + "loss": 2.9179, + "step": 2593000 + }, + { + "epoch": 0.8062272388854262, + "grad_norm": 9.385637283325195, + "learning_rate": 3.656287935190956e-05, + "loss": 2.9344, + "step": 2593500 + }, + { + "epoch": 0.8063826711659131, + "grad_norm": 11.645265579223633, + "learning_rate": 3.656028881390145e-05, + "loss": 2.9394, + "step": 2594000 + }, + { + "epoch": 0.8065381034464, + "grad_norm": 8.770450592041016, + "learning_rate": 3.6557698275893334e-05, + "loss": 2.9215, + "step": 2594500 + }, + { + "epoch": 0.8066935357268868, + "grad_norm": 9.594437599182129, + "learning_rate": 3.655510773788522e-05, + "loss": 2.8878, + "step": 2595000 + }, + { + "epoch": 0.8068489680073737, + "grad_norm": 8.558621406555176, + "learning_rate": 3.655251719987711e-05, + "loss": 2.9211, + "step": 2595500 + }, + { + "epoch": 0.8070044002878606, + "grad_norm": 7.055568218231201, + "learning_rate": 3.6549926661868995e-05, + "loss": 2.905, + "step": 2596000 + }, + { + "epoch": 0.8071598325683474, + "grad_norm": 19.26178741455078, + "learning_rate": 3.6547336123860876e-05, + "loss": 2.9457, + "step": 2596500 + }, + { + "epoch": 0.8073152648488343, + "grad_norm": 8.990277290344238, + "learning_rate": 3.654474558585276e-05, + "loss": 2.9377, + "step": 2597000 + }, + { + "epoch": 0.8074706971293212, + "grad_norm": 9.802623748779297, + "learning_rate": 3.654215504784465e-05, + "loss": 2.921, + "step": 2597500 + }, + { + "epoch": 0.807626129409808, + "grad_norm": 10.01833724975586, + "learning_rate": 3.653956450983653e-05, + "loss": 2.9338, + "step": 2598000 + }, + { + "epoch": 0.8077815616902949, + "grad_norm": 9.50786018371582, + "learning_rate": 3.653697397182842e-05, + "loss": 2.8356, + "step": 2598500 + }, + { + "epoch": 0.8079369939707819, + "grad_norm": 8.111660957336426, + "learning_rate": 3.6534383433820305e-05, + "loss": 2.9204, + "step": 2599000 + }, + { + "epoch": 0.8080924262512688, + "grad_norm": 9.143425941467285, + "learning_rate": 3.653179289581219e-05, + "loss": 2.8387, + "step": 2599500 + }, + { + "epoch": 0.8082478585317556, + "grad_norm": 8.073235511779785, + "learning_rate": 3.652920235780408e-05, + "loss": 2.8921, + "step": 2600000 + }, + { + "epoch": 0.8084032908122425, + "grad_norm": 10.296046257019043, + "learning_rate": 3.652661181979596e-05, + "loss": 2.878, + "step": 2600500 + }, + { + "epoch": 0.8085587230927294, + "grad_norm": 8.493681907653809, + "learning_rate": 3.6524021281787847e-05, + "loss": 2.8949, + "step": 2601000 + }, + { + "epoch": 0.8087141553732162, + "grad_norm": 8.11523723602295, + "learning_rate": 3.6521430743779734e-05, + "loss": 2.8796, + "step": 2601500 + }, + { + "epoch": 0.8088695876537031, + "grad_norm": 12.08910083770752, + "learning_rate": 3.6518840205771614e-05, + "loss": 2.9374, + "step": 2602000 + }, + { + "epoch": 0.80902501993419, + "grad_norm": 9.010488510131836, + "learning_rate": 3.65162496677635e-05, + "loss": 2.903, + "step": 2602500 + }, + { + "epoch": 0.8091804522146768, + "grad_norm": 8.640110969543457, + "learning_rate": 3.651365912975539e-05, + "loss": 2.9208, + "step": 2603000 + }, + { + "epoch": 0.8093358844951637, + "grad_norm": 10.416131019592285, + "learning_rate": 3.651106859174727e-05, + "loss": 2.9072, + "step": 2603500 + }, + { + "epoch": 0.8094913167756506, + "grad_norm": 8.825004577636719, + "learning_rate": 3.6508478053739156e-05, + "loss": 2.8598, + "step": 2604000 + }, + { + "epoch": 0.8096467490561374, + "grad_norm": 9.355751991271973, + "learning_rate": 3.650588751573104e-05, + "loss": 2.9384, + "step": 2604500 + }, + { + "epoch": 0.8098021813366244, + "grad_norm": 8.307352066040039, + "learning_rate": 3.650329697772293e-05, + "loss": 2.8846, + "step": 2605000 + }, + { + "epoch": 0.8099576136171113, + "grad_norm": 7.980832099914551, + "learning_rate": 3.650070643971482e-05, + "loss": 2.9416, + "step": 2605500 + }, + { + "epoch": 0.8101130458975981, + "grad_norm": 9.917590141296387, + "learning_rate": 3.64981159017067e-05, + "loss": 2.8744, + "step": 2606000 + }, + { + "epoch": 0.810268478178085, + "grad_norm": 8.348762512207031, + "learning_rate": 3.6495525363698585e-05, + "loss": 2.9418, + "step": 2606500 + }, + { + "epoch": 0.8104239104585719, + "grad_norm": 9.190790176391602, + "learning_rate": 3.649293482569047e-05, + "loss": 2.8703, + "step": 2607000 + }, + { + "epoch": 0.8105793427390587, + "grad_norm": 9.995067596435547, + "learning_rate": 3.649034428768235e-05, + "loss": 2.867, + "step": 2607500 + }, + { + "epoch": 0.8107347750195456, + "grad_norm": 8.517544746398926, + "learning_rate": 3.648775374967424e-05, + "loss": 2.8449, + "step": 2608000 + }, + { + "epoch": 0.8108902073000325, + "grad_norm": 23.992563247680664, + "learning_rate": 3.648516321166613e-05, + "loss": 2.8836, + "step": 2608500 + }, + { + "epoch": 0.8110456395805193, + "grad_norm": 9.723713874816895, + "learning_rate": 3.6482572673658014e-05, + "loss": 2.9285, + "step": 2609000 + }, + { + "epoch": 0.8112010718610062, + "grad_norm": 5.907739639282227, + "learning_rate": 3.64799821356499e-05, + "loss": 2.9166, + "step": 2609500 + }, + { + "epoch": 0.8113565041414931, + "grad_norm": 7.9707722663879395, + "learning_rate": 3.647739159764179e-05, + "loss": 2.9259, + "step": 2610000 + }, + { + "epoch": 0.8115119364219799, + "grad_norm": 7.493592262268066, + "learning_rate": 3.647480105963367e-05, + "loss": 2.8773, + "step": 2610500 + }, + { + "epoch": 0.8116673687024669, + "grad_norm": 8.751099586486816, + "learning_rate": 3.6472210521625556e-05, + "loss": 2.9079, + "step": 2611000 + }, + { + "epoch": 0.8118228009829538, + "grad_norm": 8.842279434204102, + "learning_rate": 3.6469619983617436e-05, + "loss": 2.8883, + "step": 2611500 + }, + { + "epoch": 0.8119782332634407, + "grad_norm": 9.148720741271973, + "learning_rate": 3.646702944560932e-05, + "loss": 2.937, + "step": 2612000 + }, + { + "epoch": 0.8121336655439275, + "grad_norm": 9.86596393585205, + "learning_rate": 3.646443890760121e-05, + "loss": 2.8917, + "step": 2612500 + }, + { + "epoch": 0.8122890978244144, + "grad_norm": 5.923008441925049, + "learning_rate": 3.646184836959309e-05, + "loss": 2.9161, + "step": 2613000 + }, + { + "epoch": 0.8124445301049013, + "grad_norm": 8.250937461853027, + "learning_rate": 3.645925783158498e-05, + "loss": 2.934, + "step": 2613500 + }, + { + "epoch": 0.8125999623853881, + "grad_norm": 8.770544052124023, + "learning_rate": 3.6456667293576865e-05, + "loss": 2.9361, + "step": 2614000 + }, + { + "epoch": 0.812755394665875, + "grad_norm": 22.660242080688477, + "learning_rate": 3.645407675556875e-05, + "loss": 2.8977, + "step": 2614500 + }, + { + "epoch": 0.8129108269463619, + "grad_norm": 9.330902099609375, + "learning_rate": 3.645148621756064e-05, + "loss": 2.9041, + "step": 2615000 + }, + { + "epoch": 0.8130662592268487, + "grad_norm": 8.784000396728516, + "learning_rate": 3.6448895679552527e-05, + "loss": 2.9115, + "step": 2615500 + }, + { + "epoch": 0.8132216915073356, + "grad_norm": 10.784143447875977, + "learning_rate": 3.644630514154441e-05, + "loss": 2.8879, + "step": 2616000 + }, + { + "epoch": 0.8133771237878225, + "grad_norm": 7.947121620178223, + "learning_rate": 3.6443714603536294e-05, + "loss": 2.9483, + "step": 2616500 + }, + { + "epoch": 0.8135325560683094, + "grad_norm": 6.435616970062256, + "learning_rate": 3.6441124065528174e-05, + "loss": 2.9415, + "step": 2617000 + }, + { + "epoch": 0.8136879883487963, + "grad_norm": 7.594960689544678, + "learning_rate": 3.643853352752006e-05, + "loss": 2.9217, + "step": 2617500 + }, + { + "epoch": 0.8138434206292832, + "grad_norm": 6.79650354385376, + "learning_rate": 3.643594298951195e-05, + "loss": 2.8964, + "step": 2618000 + }, + { + "epoch": 0.81399885290977, + "grad_norm": 10.99609661102295, + "learning_rate": 3.6433352451503836e-05, + "loss": 2.9306, + "step": 2618500 + }, + { + "epoch": 0.8141542851902569, + "grad_norm": 8.218420028686523, + "learning_rate": 3.643076191349572e-05, + "loss": 2.8792, + "step": 2619000 + }, + { + "epoch": 0.8143097174707438, + "grad_norm": 18.35284423828125, + "learning_rate": 3.642817137548761e-05, + "loss": 2.898, + "step": 2619500 + }, + { + "epoch": 0.8144651497512306, + "grad_norm": 9.623992919921875, + "learning_rate": 3.642558083747949e-05, + "loss": 2.9114, + "step": 2620000 + }, + { + "epoch": 0.8146205820317175, + "grad_norm": 11.563785552978516, + "learning_rate": 3.642299029947138e-05, + "loss": 2.9302, + "step": 2620500 + }, + { + "epoch": 0.8147760143122044, + "grad_norm": 9.970775604248047, + "learning_rate": 3.6420399761463265e-05, + "loss": 2.8963, + "step": 2621000 + }, + { + "epoch": 0.8149314465926912, + "grad_norm": 8.09829044342041, + "learning_rate": 3.6417809223455145e-05, + "loss": 2.9194, + "step": 2621500 + }, + { + "epoch": 0.8150868788731781, + "grad_norm": 8.163605690002441, + "learning_rate": 3.641521868544703e-05, + "loss": 2.9209, + "step": 2622000 + }, + { + "epoch": 0.815242311153665, + "grad_norm": 19.45438003540039, + "learning_rate": 3.641262814743892e-05, + "loss": 2.9193, + "step": 2622500 + }, + { + "epoch": 0.815397743434152, + "grad_norm": 7.207915782928467, + "learning_rate": 3.64100376094308e-05, + "loss": 2.9091, + "step": 2623000 + }, + { + "epoch": 0.8155531757146388, + "grad_norm": 17.871469497680664, + "learning_rate": 3.640744707142269e-05, + "loss": 2.919, + "step": 2623500 + }, + { + "epoch": 0.8157086079951257, + "grad_norm": 7.932417869567871, + "learning_rate": 3.6404856533414574e-05, + "loss": 2.9276, + "step": 2624000 + }, + { + "epoch": 0.8158640402756125, + "grad_norm": 12.547745704650879, + "learning_rate": 3.640226599540646e-05, + "loss": 2.8822, + "step": 2624500 + }, + { + "epoch": 0.8160194725560994, + "grad_norm": 9.106832504272461, + "learning_rate": 3.639967545739835e-05, + "loss": 2.8604, + "step": 2625000 + }, + { + "epoch": 0.8161749048365863, + "grad_norm": 10.583722114562988, + "learning_rate": 3.639708491939023e-05, + "loss": 2.8902, + "step": 2625500 + }, + { + "epoch": 0.8163303371170731, + "grad_norm": 18.30173110961914, + "learning_rate": 3.6394494381382116e-05, + "loss": 2.9425, + "step": 2626000 + }, + { + "epoch": 0.81648576939756, + "grad_norm": 8.428986549377441, + "learning_rate": 3.6391903843374e-05, + "loss": 2.8839, + "step": 2626500 + }, + { + "epoch": 0.8166412016780469, + "grad_norm": 8.3203706741333, + "learning_rate": 3.6389313305365884e-05, + "loss": 2.9091, + "step": 2627000 + }, + { + "epoch": 0.8167966339585337, + "grad_norm": 7.446415901184082, + "learning_rate": 3.638672276735777e-05, + "loss": 2.8931, + "step": 2627500 + }, + { + "epoch": 0.8169520662390206, + "grad_norm": 7.443166255950928, + "learning_rate": 3.638413222934966e-05, + "loss": 2.8727, + "step": 2628000 + }, + { + "epoch": 0.8171074985195075, + "grad_norm": 8.91380786895752, + "learning_rate": 3.6381541691341545e-05, + "loss": 2.8891, + "step": 2628500 + }, + { + "epoch": 0.8172629307999945, + "grad_norm": 7.822865962982178, + "learning_rate": 3.637895115333343e-05, + "loss": 2.9057, + "step": 2629000 + }, + { + "epoch": 0.8174183630804813, + "grad_norm": 9.966245651245117, + "learning_rate": 3.637636061532531e-05, + "loss": 2.9342, + "step": 2629500 + }, + { + "epoch": 0.8175737953609682, + "grad_norm": 9.065882682800293, + "learning_rate": 3.63737700773172e-05, + "loss": 2.8943, + "step": 2630000 + }, + { + "epoch": 0.8177292276414551, + "grad_norm": 7.224596977233887, + "learning_rate": 3.637117953930909e-05, + "loss": 2.9098, + "step": 2630500 + }, + { + "epoch": 0.8178846599219419, + "grad_norm": 8.42428207397461, + "learning_rate": 3.636858900130097e-05, + "loss": 2.9348, + "step": 2631000 + }, + { + "epoch": 0.8180400922024288, + "grad_norm": 9.04787540435791, + "learning_rate": 3.6365998463292855e-05, + "loss": 2.9224, + "step": 2631500 + }, + { + "epoch": 0.8181955244829157, + "grad_norm": 7.02747917175293, + "learning_rate": 3.636340792528474e-05, + "loss": 2.9157, + "step": 2632000 + }, + { + "epoch": 0.8183509567634025, + "grad_norm": 8.639720916748047, + "learning_rate": 3.636081738727662e-05, + "loss": 2.9153, + "step": 2632500 + }, + { + "epoch": 0.8185063890438894, + "grad_norm": 7.28797721862793, + "learning_rate": 3.635822684926851e-05, + "loss": 2.9194, + "step": 2633000 + }, + { + "epoch": 0.8186618213243763, + "grad_norm": 9.205387115478516, + "learning_rate": 3.6355636311260396e-05, + "loss": 2.8938, + "step": 2633500 + }, + { + "epoch": 0.8188172536048631, + "grad_norm": 8.48323917388916, + "learning_rate": 3.6353045773252284e-05, + "loss": 2.9062, + "step": 2634000 + }, + { + "epoch": 0.81897268588535, + "grad_norm": 8.40034008026123, + "learning_rate": 3.635045523524417e-05, + "loss": 2.8801, + "step": 2634500 + }, + { + "epoch": 0.819128118165837, + "grad_norm": 13.71512508392334, + "learning_rate": 3.634786469723605e-05, + "loss": 2.9284, + "step": 2635000 + }, + { + "epoch": 0.8192835504463238, + "grad_norm": 9.585257530212402, + "learning_rate": 3.634527415922794e-05, + "loss": 2.9179, + "step": 2635500 + }, + { + "epoch": 0.8194389827268107, + "grad_norm": 13.788865089416504, + "learning_rate": 3.6342683621219825e-05, + "loss": 2.9145, + "step": 2636000 + }, + { + "epoch": 0.8195944150072976, + "grad_norm": 12.37924861907959, + "learning_rate": 3.6340093083211706e-05, + "loss": 2.8751, + "step": 2636500 + }, + { + "epoch": 0.8197498472877844, + "grad_norm": 11.36154842376709, + "learning_rate": 3.633750254520359e-05, + "loss": 2.9012, + "step": 2637000 + }, + { + "epoch": 0.8199052795682713, + "grad_norm": 9.348031044006348, + "learning_rate": 3.633491200719548e-05, + "loss": 2.9292, + "step": 2637500 + }, + { + "epoch": 0.8200607118487582, + "grad_norm": 4.786617755889893, + "learning_rate": 3.633232146918736e-05, + "loss": 2.919, + "step": 2638000 + }, + { + "epoch": 0.820216144129245, + "grad_norm": 10.547809600830078, + "learning_rate": 3.6329730931179254e-05, + "loss": 2.9081, + "step": 2638500 + }, + { + "epoch": 0.8203715764097319, + "grad_norm": 10.314239501953125, + "learning_rate": 3.632714039317114e-05, + "loss": 2.9319, + "step": 2639000 + }, + { + "epoch": 0.8205270086902188, + "grad_norm": 7.018503189086914, + "learning_rate": 3.632454985516302e-05, + "loss": 2.8891, + "step": 2639500 + }, + { + "epoch": 0.8206824409707056, + "grad_norm": 11.096253395080566, + "learning_rate": 3.632195931715491e-05, + "loss": 2.8933, + "step": 2640000 + }, + { + "epoch": 0.8208378732511925, + "grad_norm": 10.773820877075195, + "learning_rate": 3.6319368779146796e-05, + "loss": 2.8941, + "step": 2640500 + }, + { + "epoch": 0.8209933055316795, + "grad_norm": 8.942464828491211, + "learning_rate": 3.6316778241138677e-05, + "loss": 2.8868, + "step": 2641000 + }, + { + "epoch": 0.8211487378121664, + "grad_norm": 9.417928695678711, + "learning_rate": 3.6314187703130564e-05, + "loss": 2.9137, + "step": 2641500 + }, + { + "epoch": 0.8213041700926532, + "grad_norm": 11.588916778564453, + "learning_rate": 3.6311597165122444e-05, + "loss": 2.91, + "step": 2642000 + }, + { + "epoch": 0.8214596023731401, + "grad_norm": 8.757369041442871, + "learning_rate": 3.630900662711433e-05, + "loss": 2.9247, + "step": 2642500 + }, + { + "epoch": 0.821615034653627, + "grad_norm": 7.686764717102051, + "learning_rate": 3.630641608910622e-05, + "loss": 2.8844, + "step": 2643000 + }, + { + "epoch": 0.8217704669341138, + "grad_norm": 20.543628692626953, + "learning_rate": 3.6303825551098106e-05, + "loss": 2.9036, + "step": 2643500 + }, + { + "epoch": 0.8219258992146007, + "grad_norm": 22.679292678833008, + "learning_rate": 3.630123501308999e-05, + "loss": 2.9416, + "step": 2644000 + }, + { + "epoch": 0.8220813314950876, + "grad_norm": 8.73270320892334, + "learning_rate": 3.629864447508188e-05, + "loss": 2.9195, + "step": 2644500 + }, + { + "epoch": 0.8222367637755744, + "grad_norm": 9.61650562286377, + "learning_rate": 3.629605393707376e-05, + "loss": 2.9138, + "step": 2645000 + }, + { + "epoch": 0.8223921960560613, + "grad_norm": 8.038592338562012, + "learning_rate": 3.629346339906565e-05, + "loss": 2.8792, + "step": 2645500 + }, + { + "epoch": 0.8225476283365482, + "grad_norm": 8.375465393066406, + "learning_rate": 3.6290872861057535e-05, + "loss": 2.9107, + "step": 2646000 + }, + { + "epoch": 0.822703060617035, + "grad_norm": 7.720751762390137, + "learning_rate": 3.6288282323049415e-05, + "loss": 2.9069, + "step": 2646500 + }, + { + "epoch": 0.822858492897522, + "grad_norm": 12.065994262695312, + "learning_rate": 3.62856917850413e-05, + "loss": 2.9306, + "step": 2647000 + }, + { + "epoch": 0.8230139251780089, + "grad_norm": 9.303906440734863, + "learning_rate": 3.628310124703318e-05, + "loss": 2.8081, + "step": 2647500 + }, + { + "epoch": 0.8231693574584957, + "grad_norm": 9.273781776428223, + "learning_rate": 3.628051070902507e-05, + "loss": 2.9261, + "step": 2648000 + }, + { + "epoch": 0.8233247897389826, + "grad_norm": 11.999656677246094, + "learning_rate": 3.6277920171016964e-05, + "loss": 2.913, + "step": 2648500 + }, + { + "epoch": 0.8234802220194695, + "grad_norm": 6.800549030303955, + "learning_rate": 3.6275329633008844e-05, + "loss": 2.8937, + "step": 2649000 + }, + { + "epoch": 0.8236356542999563, + "grad_norm": 7.690309524536133, + "learning_rate": 3.627273909500073e-05, + "loss": 2.9041, + "step": 2649500 + }, + { + "epoch": 0.8237910865804432, + "grad_norm": 8.090387344360352, + "learning_rate": 3.627014855699262e-05, + "loss": 2.8987, + "step": 2650000 + }, + { + "epoch": 0.8239465188609301, + "grad_norm": 7.605876445770264, + "learning_rate": 3.62675580189845e-05, + "loss": 2.8836, + "step": 2650500 + }, + { + "epoch": 0.8241019511414169, + "grad_norm": 7.764209747314453, + "learning_rate": 3.6264967480976386e-05, + "loss": 2.9323, + "step": 2651000 + }, + { + "epoch": 0.8242573834219038, + "grad_norm": 7.338250160217285, + "learning_rate": 3.626237694296827e-05, + "loss": 2.892, + "step": 2651500 + }, + { + "epoch": 0.8244128157023907, + "grad_norm": 8.765608787536621, + "learning_rate": 3.625978640496015e-05, + "loss": 2.9108, + "step": 2652000 + }, + { + "epoch": 0.8245682479828775, + "grad_norm": 8.91552734375, + "learning_rate": 3.625719586695204e-05, + "loss": 2.8978, + "step": 2652500 + }, + { + "epoch": 0.8247236802633645, + "grad_norm": 14.080084800720215, + "learning_rate": 3.625460532894393e-05, + "loss": 2.9072, + "step": 2653000 + }, + { + "epoch": 0.8248791125438514, + "grad_norm": 7.335051536560059, + "learning_rate": 3.6252014790935815e-05, + "loss": 2.8957, + "step": 2653500 + }, + { + "epoch": 0.8250345448243382, + "grad_norm": 6.99788236618042, + "learning_rate": 3.62494242529277e-05, + "loss": 2.8744, + "step": 2654000 + }, + { + "epoch": 0.8251899771048251, + "grad_norm": 13.292312622070312, + "learning_rate": 3.624683371491958e-05, + "loss": 2.9268, + "step": 2654500 + }, + { + "epoch": 0.825345409385312, + "grad_norm": 38.3483772277832, + "learning_rate": 3.624424317691147e-05, + "loss": 2.9478, + "step": 2655000 + }, + { + "epoch": 0.8255008416657988, + "grad_norm": 9.475226402282715, + "learning_rate": 3.6241652638903357e-05, + "loss": 2.8976, + "step": 2655500 + }, + { + "epoch": 0.8256562739462857, + "grad_norm": 8.005743980407715, + "learning_rate": 3.623906210089524e-05, + "loss": 2.9123, + "step": 2656000 + }, + { + "epoch": 0.8258117062267726, + "grad_norm": 30.296119689941406, + "learning_rate": 3.6236471562887124e-05, + "loss": 2.9331, + "step": 2656500 + }, + { + "epoch": 0.8259671385072594, + "grad_norm": 8.340116500854492, + "learning_rate": 3.623388102487901e-05, + "loss": 2.8844, + "step": 2657000 + }, + { + "epoch": 0.8261225707877463, + "grad_norm": 6.992702960968018, + "learning_rate": 3.623129048687089e-05, + "loss": 2.8448, + "step": 2657500 + }, + { + "epoch": 0.8262780030682332, + "grad_norm": 8.17880916595459, + "learning_rate": 3.622869994886278e-05, + "loss": 2.9456, + "step": 2658000 + }, + { + "epoch": 0.82643343534872, + "grad_norm": 8.932817459106445, + "learning_rate": 3.622610941085467e-05, + "loss": 2.8939, + "step": 2658500 + }, + { + "epoch": 0.8265888676292069, + "grad_norm": 9.447041511535645, + "learning_rate": 3.622351887284655e-05, + "loss": 2.9251, + "step": 2659000 + }, + { + "epoch": 0.8267442999096939, + "grad_norm": 9.754058837890625, + "learning_rate": 3.622092833483844e-05, + "loss": 2.9064, + "step": 2659500 + }, + { + "epoch": 0.8268997321901808, + "grad_norm": 8.645689964294434, + "learning_rate": 3.621833779683032e-05, + "loss": 2.8804, + "step": 2660000 + }, + { + "epoch": 0.8270551644706676, + "grad_norm": 14.919047355651855, + "learning_rate": 3.621574725882221e-05, + "loss": 2.9268, + "step": 2660500 + }, + { + "epoch": 0.8272105967511545, + "grad_norm": 7.860114574432373, + "learning_rate": 3.6213156720814095e-05, + "loss": 2.8784, + "step": 2661000 + }, + { + "epoch": 0.8273660290316414, + "grad_norm": 7.784646987915039, + "learning_rate": 3.6210566182805975e-05, + "loss": 2.9041, + "step": 2661500 + }, + { + "epoch": 0.8275214613121282, + "grad_norm": 9.207144737243652, + "learning_rate": 3.620797564479786e-05, + "loss": 2.8405, + "step": 2662000 + }, + { + "epoch": 0.8276768935926151, + "grad_norm": 9.126748085021973, + "learning_rate": 3.620538510678975e-05, + "loss": 2.8924, + "step": 2662500 + }, + { + "epoch": 0.827832325873102, + "grad_norm": 8.657840728759766, + "learning_rate": 3.620279456878164e-05, + "loss": 2.9021, + "step": 2663000 + }, + { + "epoch": 0.8279877581535888, + "grad_norm": 15.080483436584473, + "learning_rate": 3.6200204030773524e-05, + "loss": 2.9426, + "step": 2663500 + }, + { + "epoch": 0.8281431904340757, + "grad_norm": 25.129642486572266, + "learning_rate": 3.619761349276541e-05, + "loss": 2.8867, + "step": 2664000 + }, + { + "epoch": 0.8282986227145626, + "grad_norm": 8.148141860961914, + "learning_rate": 3.619502295475729e-05, + "loss": 2.8497, + "step": 2664500 + }, + { + "epoch": 0.8284540549950494, + "grad_norm": 6.273400783538818, + "learning_rate": 3.619243241674918e-05, + "loss": 2.9002, + "step": 2665000 + }, + { + "epoch": 0.8286094872755364, + "grad_norm": 10.510276794433594, + "learning_rate": 3.618984187874106e-05, + "loss": 2.8704, + "step": 2665500 + }, + { + "epoch": 0.8287649195560233, + "grad_norm": 24.41478157043457, + "learning_rate": 3.6187251340732946e-05, + "loss": 2.8754, + "step": 2666000 + }, + { + "epoch": 0.8289203518365101, + "grad_norm": 12.121614456176758, + "learning_rate": 3.618466080272483e-05, + "loss": 2.8987, + "step": 2666500 + }, + { + "epoch": 0.829075784116997, + "grad_norm": 8.258788108825684, + "learning_rate": 3.6182070264716714e-05, + "loss": 2.904, + "step": 2667000 + }, + { + "epoch": 0.8292312163974839, + "grad_norm": 7.822555065155029, + "learning_rate": 3.61794797267086e-05, + "loss": 2.8725, + "step": 2667500 + }, + { + "epoch": 0.8293866486779707, + "grad_norm": 8.596210479736328, + "learning_rate": 3.617688918870049e-05, + "loss": 2.9282, + "step": 2668000 + }, + { + "epoch": 0.8295420809584576, + "grad_norm": 10.765729904174805, + "learning_rate": 3.6174298650692375e-05, + "loss": 2.9474, + "step": 2668500 + }, + { + "epoch": 0.8296975132389445, + "grad_norm": 9.257393836975098, + "learning_rate": 3.617170811268426e-05, + "loss": 2.9075, + "step": 2669000 + }, + { + "epoch": 0.8298529455194313, + "grad_norm": 8.504521369934082, + "learning_rate": 3.616911757467615e-05, + "loss": 2.9036, + "step": 2669500 + }, + { + "epoch": 0.8300083777999182, + "grad_norm": 8.896608352661133, + "learning_rate": 3.616652703666803e-05, + "loss": 2.9001, + "step": 2670000 + }, + { + "epoch": 0.8301638100804051, + "grad_norm": 7.838484764099121, + "learning_rate": 3.616393649865992e-05, + "loss": 2.8753, + "step": 2670500 + }, + { + "epoch": 0.8303192423608919, + "grad_norm": 7.643065929412842, + "learning_rate": 3.6161345960651804e-05, + "loss": 2.9587, + "step": 2671000 + }, + { + "epoch": 0.8304746746413789, + "grad_norm": 8.142481803894043, + "learning_rate": 3.6158755422643684e-05, + "loss": 2.9032, + "step": 2671500 + }, + { + "epoch": 0.8306301069218658, + "grad_norm": 7.876482963562012, + "learning_rate": 3.615616488463557e-05, + "loss": 2.9294, + "step": 2672000 + }, + { + "epoch": 0.8307855392023527, + "grad_norm": 10.209948539733887, + "learning_rate": 3.615357434662746e-05, + "loss": 2.9125, + "step": 2672500 + }, + { + "epoch": 0.8309409714828395, + "grad_norm": 8.520614624023438, + "learning_rate": 3.6150983808619346e-05, + "loss": 2.8736, + "step": 2673000 + }, + { + "epoch": 0.8310964037633264, + "grad_norm": 10.853599548339844, + "learning_rate": 3.614839327061123e-05, + "loss": 2.8914, + "step": 2673500 + }, + { + "epoch": 0.8312518360438133, + "grad_norm": 8.42007827758789, + "learning_rate": 3.6145802732603113e-05, + "loss": 2.8918, + "step": 2674000 + }, + { + "epoch": 0.8314072683243001, + "grad_norm": 7.948575973510742, + "learning_rate": 3.6143212194595e-05, + "loss": 2.8866, + "step": 2674500 + }, + { + "epoch": 0.831562700604787, + "grad_norm": 13.394796371459961, + "learning_rate": 3.614062165658689e-05, + "loss": 2.8729, + "step": 2675000 + }, + { + "epoch": 0.8317181328852739, + "grad_norm": 6.7534260749816895, + "learning_rate": 3.613803111857877e-05, + "loss": 2.9192, + "step": 2675500 + }, + { + "epoch": 0.8318735651657607, + "grad_norm": 8.063529968261719, + "learning_rate": 3.6135440580570655e-05, + "loss": 2.8886, + "step": 2676000 + }, + { + "epoch": 0.8320289974462476, + "grad_norm": 9.097535133361816, + "learning_rate": 3.613285004256254e-05, + "loss": 2.9217, + "step": 2676500 + }, + { + "epoch": 0.8321844297267345, + "grad_norm": 7.117336750030518, + "learning_rate": 3.613025950455442e-05, + "loss": 2.9451, + "step": 2677000 + }, + { + "epoch": 0.8323398620072214, + "grad_norm": 18.843019485473633, + "learning_rate": 3.612766896654631e-05, + "loss": 2.9043, + "step": 2677500 + }, + { + "epoch": 0.8324952942877083, + "grad_norm": 10.325230598449707, + "learning_rate": 3.61250784285382e-05, + "loss": 2.8717, + "step": 2678000 + }, + { + "epoch": 0.8326507265681952, + "grad_norm": 8.373753547668457, + "learning_rate": 3.6122487890530084e-05, + "loss": 2.8512, + "step": 2678500 + }, + { + "epoch": 0.832806158848682, + "grad_norm": 8.493638038635254, + "learning_rate": 3.611989735252197e-05, + "loss": 2.9145, + "step": 2679000 + }, + { + "epoch": 0.8329615911291689, + "grad_norm": 8.75475788116455, + "learning_rate": 3.611730681451385e-05, + "loss": 2.8709, + "step": 2679500 + }, + { + "epoch": 0.8331170234096558, + "grad_norm": 8.576288223266602, + "learning_rate": 3.611471627650574e-05, + "loss": 2.9142, + "step": 2680000 + }, + { + "epoch": 0.8332724556901426, + "grad_norm": 11.043195724487305, + "learning_rate": 3.6112125738497626e-05, + "loss": 2.9706, + "step": 2680500 + }, + { + "epoch": 0.8334278879706295, + "grad_norm": 8.78916072845459, + "learning_rate": 3.6109535200489507e-05, + "loss": 2.9084, + "step": 2681000 + }, + { + "epoch": 0.8335833202511164, + "grad_norm": 8.94179630279541, + "learning_rate": 3.6106944662481394e-05, + "loss": 2.8912, + "step": 2681500 + }, + { + "epoch": 0.8337387525316032, + "grad_norm": 7.221092700958252, + "learning_rate": 3.610435412447328e-05, + "loss": 2.9251, + "step": 2682000 + }, + { + "epoch": 0.8338941848120901, + "grad_norm": 6.512747287750244, + "learning_rate": 3.610176358646517e-05, + "loss": 2.8843, + "step": 2682500 + }, + { + "epoch": 0.834049617092577, + "grad_norm": 8.852471351623535, + "learning_rate": 3.6099173048457055e-05, + "loss": 2.9537, + "step": 2683000 + }, + { + "epoch": 0.834205049373064, + "grad_norm": 8.16919994354248, + "learning_rate": 3.6096582510448936e-05, + "loss": 2.9199, + "step": 2683500 + }, + { + "epoch": 0.8343604816535508, + "grad_norm": 9.130505561828613, + "learning_rate": 3.609399197244082e-05, + "loss": 2.8791, + "step": 2684000 + }, + { + "epoch": 0.8345159139340377, + "grad_norm": 7.880231857299805, + "learning_rate": 3.609140143443271e-05, + "loss": 2.8211, + "step": 2684500 + }, + { + "epoch": 0.8346713462145245, + "grad_norm": 14.385941505432129, + "learning_rate": 3.608881089642459e-05, + "loss": 2.9395, + "step": 2685000 + }, + { + "epoch": 0.8348267784950114, + "grad_norm": 8.703706741333008, + "learning_rate": 3.608622035841648e-05, + "loss": 2.9473, + "step": 2685500 + }, + { + "epoch": 0.8349822107754983, + "grad_norm": 8.545166015625, + "learning_rate": 3.6083629820408364e-05, + "loss": 2.9162, + "step": 2686000 + }, + { + "epoch": 0.8351376430559851, + "grad_norm": 9.615105628967285, + "learning_rate": 3.6081039282400245e-05, + "loss": 2.8801, + "step": 2686500 + }, + { + "epoch": 0.835293075336472, + "grad_norm": 7.824677467346191, + "learning_rate": 3.607844874439213e-05, + "loss": 2.8888, + "step": 2687000 + }, + { + "epoch": 0.8354485076169589, + "grad_norm": 10.865248680114746, + "learning_rate": 3.607585820638402e-05, + "loss": 2.9077, + "step": 2687500 + }, + { + "epoch": 0.8356039398974457, + "grad_norm": 8.428157806396484, + "learning_rate": 3.6073267668375906e-05, + "loss": 2.8977, + "step": 2688000 + }, + { + "epoch": 0.8357593721779326, + "grad_norm": 9.775031089782715, + "learning_rate": 3.6070677130367793e-05, + "loss": 2.8694, + "step": 2688500 + }, + { + "epoch": 0.8359148044584195, + "grad_norm": 8.011305809020996, + "learning_rate": 3.606808659235968e-05, + "loss": 2.8558, + "step": 2689000 + }, + { + "epoch": 0.8360702367389065, + "grad_norm": 6.231021881103516, + "learning_rate": 3.606549605435156e-05, + "loss": 2.8923, + "step": 2689500 + }, + { + "epoch": 0.8362256690193933, + "grad_norm": 6.836926460266113, + "learning_rate": 3.606290551634345e-05, + "loss": 2.9078, + "step": 2690000 + }, + { + "epoch": 0.8363811012998802, + "grad_norm": 12.069127082824707, + "learning_rate": 3.606031497833533e-05, + "loss": 2.9433, + "step": 2690500 + }, + { + "epoch": 0.8365365335803671, + "grad_norm": 8.507583618164062, + "learning_rate": 3.6057724440327216e-05, + "loss": 2.9113, + "step": 2691000 + }, + { + "epoch": 0.8366919658608539, + "grad_norm": 9.091086387634277, + "learning_rate": 3.60551339023191e-05, + "loss": 2.897, + "step": 2691500 + }, + { + "epoch": 0.8368473981413408, + "grad_norm": 18.484729766845703, + "learning_rate": 3.605254336431099e-05, + "loss": 2.8877, + "step": 2692000 + }, + { + "epoch": 0.8370028304218277, + "grad_norm": 13.864032745361328, + "learning_rate": 3.604995282630288e-05, + "loss": 2.882, + "step": 2692500 + }, + { + "epoch": 0.8371582627023145, + "grad_norm": 8.055971145629883, + "learning_rate": 3.6047362288294764e-05, + "loss": 2.9115, + "step": 2693000 + }, + { + "epoch": 0.8373136949828014, + "grad_norm": 7.352261066436768, + "learning_rate": 3.6044771750286645e-05, + "loss": 2.8576, + "step": 2693500 + }, + { + "epoch": 0.8374691272632883, + "grad_norm": 8.431221008300781, + "learning_rate": 3.604218121227853e-05, + "loss": 2.8872, + "step": 2694000 + }, + { + "epoch": 0.8376245595437751, + "grad_norm": 8.119096755981445, + "learning_rate": 3.603959067427042e-05, + "loss": 2.9238, + "step": 2694500 + }, + { + "epoch": 0.837779991824262, + "grad_norm": 21.548595428466797, + "learning_rate": 3.60370001362623e-05, + "loss": 2.9101, + "step": 2695000 + }, + { + "epoch": 0.837935424104749, + "grad_norm": 10.5781831741333, + "learning_rate": 3.6034409598254187e-05, + "loss": 2.913, + "step": 2695500 + }, + { + "epoch": 0.8380908563852358, + "grad_norm": 22.12555503845215, + "learning_rate": 3.603181906024607e-05, + "loss": 2.8603, + "step": 2696000 + }, + { + "epoch": 0.8382462886657227, + "grad_norm": 6.603510856628418, + "learning_rate": 3.6029228522237954e-05, + "loss": 2.9092, + "step": 2696500 + }, + { + "epoch": 0.8384017209462096, + "grad_norm": 16.29073143005371, + "learning_rate": 3.602663798422984e-05, + "loss": 2.8805, + "step": 2697000 + }, + { + "epoch": 0.8385571532266964, + "grad_norm": 11.2534818649292, + "learning_rate": 3.602404744622173e-05, + "loss": 2.9245, + "step": 2697500 + }, + { + "epoch": 0.8387125855071833, + "grad_norm": 9.67812442779541, + "learning_rate": 3.6021456908213616e-05, + "loss": 2.8551, + "step": 2698000 + }, + { + "epoch": 0.8388680177876702, + "grad_norm": 9.965039253234863, + "learning_rate": 3.60188663702055e-05, + "loss": 2.91, + "step": 2698500 + }, + { + "epoch": 0.839023450068157, + "grad_norm": 9.082486152648926, + "learning_rate": 3.601627583219738e-05, + "loss": 2.8985, + "step": 2699000 + }, + { + "epoch": 0.8391788823486439, + "grad_norm": 9.941302299499512, + "learning_rate": 3.601368529418927e-05, + "loss": 2.8478, + "step": 2699500 + }, + { + "epoch": 0.8393343146291308, + "grad_norm": 7.516843795776367, + "learning_rate": 3.601109475618116e-05, + "loss": 2.8958, + "step": 2700000 + }, + { + "epoch": 0.8394897469096176, + "grad_norm": 10.338723182678223, + "learning_rate": 3.600850421817304e-05, + "loss": 2.9105, + "step": 2700500 + }, + { + "epoch": 0.8396451791901045, + "grad_norm": 6.470137596130371, + "learning_rate": 3.6005913680164925e-05, + "loss": 2.8536, + "step": 2701000 + }, + { + "epoch": 0.8398006114705915, + "grad_norm": 10.754190444946289, + "learning_rate": 3.6003323142156805e-05, + "loss": 2.9176, + "step": 2701500 + }, + { + "epoch": 0.8399560437510784, + "grad_norm": 8.83544635772705, + "learning_rate": 3.60007326041487e-05, + "loss": 2.9362, + "step": 2702000 + }, + { + "epoch": 0.8401114760315652, + "grad_norm": 9.34730339050293, + "learning_rate": 3.5998142066140586e-05, + "loss": 2.9377, + "step": 2702500 + }, + { + "epoch": 0.8402669083120521, + "grad_norm": 9.792076110839844, + "learning_rate": 3.599555152813247e-05, + "loss": 2.984, + "step": 2703000 + }, + { + "epoch": 0.840422340592539, + "grad_norm": 11.101140975952148, + "learning_rate": 3.5992960990124354e-05, + "loss": 2.903, + "step": 2703500 + }, + { + "epoch": 0.8405777728730258, + "grad_norm": 8.22667121887207, + "learning_rate": 3.599037045211624e-05, + "loss": 2.9188, + "step": 2704000 + }, + { + "epoch": 0.8407332051535127, + "grad_norm": 10.655871391296387, + "learning_rate": 3.598777991410812e-05, + "loss": 2.8759, + "step": 2704500 + }, + { + "epoch": 0.8408886374339996, + "grad_norm": 9.100309371948242, + "learning_rate": 3.598518937610001e-05, + "loss": 2.9191, + "step": 2705000 + }, + { + "epoch": 0.8410440697144864, + "grad_norm": 8.276202201843262, + "learning_rate": 3.5982598838091896e-05, + "loss": 2.9064, + "step": 2705500 + }, + { + "epoch": 0.8411995019949733, + "grad_norm": 45.93406295776367, + "learning_rate": 3.5980008300083776e-05, + "loss": 2.8928, + "step": 2706000 + }, + { + "epoch": 0.8413549342754602, + "grad_norm": 9.097641944885254, + "learning_rate": 3.597741776207566e-05, + "loss": 2.8991, + "step": 2706500 + }, + { + "epoch": 0.841510366555947, + "grad_norm": 9.251473426818848, + "learning_rate": 3.597482722406755e-05, + "loss": 2.9265, + "step": 2707000 + }, + { + "epoch": 0.841665798836434, + "grad_norm": 9.631978988647461, + "learning_rate": 3.597223668605944e-05, + "loss": 2.9086, + "step": 2707500 + }, + { + "epoch": 0.8418212311169209, + "grad_norm": 10.545916557312012, + "learning_rate": 3.5969646148051325e-05, + "loss": 2.9386, + "step": 2708000 + }, + { + "epoch": 0.8419766633974077, + "grad_norm": 24.518482208251953, + "learning_rate": 3.5967055610043205e-05, + "loss": 2.9186, + "step": 2708500 + }, + { + "epoch": 0.8421320956778946, + "grad_norm": 16.236053466796875, + "learning_rate": 3.596446507203509e-05, + "loss": 2.8673, + "step": 2709000 + }, + { + "epoch": 0.8422875279583815, + "grad_norm": 10.894280433654785, + "learning_rate": 3.596187453402698e-05, + "loss": 2.9085, + "step": 2709500 + }, + { + "epoch": 0.8424429602388683, + "grad_norm": 8.925220489501953, + "learning_rate": 3.595928399601886e-05, + "loss": 2.8803, + "step": 2710000 + }, + { + "epoch": 0.8425983925193552, + "grad_norm": 11.142158508300781, + "learning_rate": 3.595669345801075e-05, + "loss": 2.9289, + "step": 2710500 + }, + { + "epoch": 0.8427538247998421, + "grad_norm": 11.742486953735352, + "learning_rate": 3.5954102920002634e-05, + "loss": 2.8786, + "step": 2711000 + }, + { + "epoch": 0.8429092570803289, + "grad_norm": 23.220542907714844, + "learning_rate": 3.5951512381994514e-05, + "loss": 2.8803, + "step": 2711500 + }, + { + "epoch": 0.8430646893608158, + "grad_norm": 33.82659149169922, + "learning_rate": 3.594892184398641e-05, + "loss": 2.8857, + "step": 2712000 + }, + { + "epoch": 0.8432201216413027, + "grad_norm": 9.338218688964844, + "learning_rate": 3.5946331305978296e-05, + "loss": 2.9158, + "step": 2712500 + }, + { + "epoch": 0.8433755539217895, + "grad_norm": 6.608675479888916, + "learning_rate": 3.5943740767970176e-05, + "loss": 2.8996, + "step": 2713000 + }, + { + "epoch": 0.8435309862022765, + "grad_norm": 83.25355529785156, + "learning_rate": 3.594115022996206e-05, + "loss": 2.8954, + "step": 2713500 + }, + { + "epoch": 0.8436864184827634, + "grad_norm": 8.700485229492188, + "learning_rate": 3.5938559691953943e-05, + "loss": 2.9191, + "step": 2714000 + }, + { + "epoch": 0.8438418507632502, + "grad_norm": 12.736223220825195, + "learning_rate": 3.593596915394583e-05, + "loss": 2.8872, + "step": 2714500 + }, + { + "epoch": 0.8439972830437371, + "grad_norm": 7.700325965881348, + "learning_rate": 3.593337861593772e-05, + "loss": 2.9035, + "step": 2715000 + }, + { + "epoch": 0.844152715324224, + "grad_norm": 9.293882369995117, + "learning_rate": 3.59307880779296e-05, + "loss": 2.9054, + "step": 2715500 + }, + { + "epoch": 0.8443081476047108, + "grad_norm": 8.801342964172363, + "learning_rate": 3.5928197539921485e-05, + "loss": 2.9135, + "step": 2716000 + }, + { + "epoch": 0.8444635798851977, + "grad_norm": 8.74396800994873, + "learning_rate": 3.592560700191337e-05, + "loss": 2.9202, + "step": 2716500 + }, + { + "epoch": 0.8446190121656846, + "grad_norm": 8.045878410339355, + "learning_rate": 3.592301646390526e-05, + "loss": 2.9425, + "step": 2717000 + }, + { + "epoch": 0.8447744444461714, + "grad_norm": 6.898787975311279, + "learning_rate": 3.592042592589715e-05, + "loss": 2.9371, + "step": 2717500 + }, + { + "epoch": 0.8449298767266583, + "grad_norm": 7.00898551940918, + "learning_rate": 3.5917835387889034e-05, + "loss": 2.9346, + "step": 2718000 + }, + { + "epoch": 0.8450853090071452, + "grad_norm": 7.901123046875, + "learning_rate": 3.5915244849880914e-05, + "loss": 2.911, + "step": 2718500 + }, + { + "epoch": 0.845240741287632, + "grad_norm": 6.115572929382324, + "learning_rate": 3.59126543118728e-05, + "loss": 2.9135, + "step": 2719000 + }, + { + "epoch": 0.845396173568119, + "grad_norm": 7.008376598358154, + "learning_rate": 3.591006377386468e-05, + "loss": 2.8877, + "step": 2719500 + }, + { + "epoch": 0.8455516058486059, + "grad_norm": 7.111575603485107, + "learning_rate": 3.590747323585657e-05, + "loss": 2.9033, + "step": 2720000 + }, + { + "epoch": 0.8457070381290928, + "grad_norm": 9.2915678024292, + "learning_rate": 3.5904882697848456e-05, + "loss": 2.8909, + "step": 2720500 + }, + { + "epoch": 0.8458624704095796, + "grad_norm": 8.912145614624023, + "learning_rate": 3.5902292159840336e-05, + "loss": 2.917, + "step": 2721000 + }, + { + "epoch": 0.8460179026900665, + "grad_norm": 10.489070892333984, + "learning_rate": 3.5899701621832224e-05, + "loss": 2.8654, + "step": 2721500 + }, + { + "epoch": 0.8461733349705534, + "grad_norm": 6.858118057250977, + "learning_rate": 3.589711108382412e-05, + "loss": 2.9041, + "step": 2722000 + }, + { + "epoch": 0.8463287672510402, + "grad_norm": 8.790522575378418, + "learning_rate": 3.5894520545816e-05, + "loss": 2.8536, + "step": 2722500 + }, + { + "epoch": 0.8464841995315271, + "grad_norm": 9.22787857055664, + "learning_rate": 3.5891930007807885e-05, + "loss": 2.9113, + "step": 2723000 + }, + { + "epoch": 0.846639631812014, + "grad_norm": 9.040449142456055, + "learning_rate": 3.588933946979977e-05, + "loss": 2.911, + "step": 2723500 + }, + { + "epoch": 0.8467950640925008, + "grad_norm": 9.081884384155273, + "learning_rate": 3.588674893179165e-05, + "loss": 2.8906, + "step": 2724000 + }, + { + "epoch": 0.8469504963729877, + "grad_norm": 9.854942321777344, + "learning_rate": 3.588415839378354e-05, + "loss": 2.9003, + "step": 2724500 + }, + { + "epoch": 0.8471059286534746, + "grad_norm": 8.802563667297363, + "learning_rate": 3.588156785577543e-05, + "loss": 2.8887, + "step": 2725000 + }, + { + "epoch": 0.8472613609339615, + "grad_norm": 9.31278133392334, + "learning_rate": 3.587897731776731e-05, + "loss": 2.8666, + "step": 2725500 + }, + { + "epoch": 0.8474167932144484, + "grad_norm": 9.674457550048828, + "learning_rate": 3.5876386779759194e-05, + "loss": 2.9021, + "step": 2726000 + }, + { + "epoch": 0.8475722254949353, + "grad_norm": 10.083439826965332, + "learning_rate": 3.587379624175108e-05, + "loss": 2.8885, + "step": 2726500 + }, + { + "epoch": 0.8477276577754221, + "grad_norm": 8.478184700012207, + "learning_rate": 3.587120570374297e-05, + "loss": 2.8868, + "step": 2727000 + }, + { + "epoch": 0.847883090055909, + "grad_norm": 8.124077796936035, + "learning_rate": 3.5868615165734856e-05, + "loss": 2.9397, + "step": 2727500 + }, + { + "epoch": 0.8480385223363959, + "grad_norm": 22.6605281829834, + "learning_rate": 3.5866024627726736e-05, + "loss": 2.9043, + "step": 2728000 + }, + { + "epoch": 0.8481939546168827, + "grad_norm": 9.389463424682617, + "learning_rate": 3.5863434089718623e-05, + "loss": 2.879, + "step": 2728500 + }, + { + "epoch": 0.8483493868973696, + "grad_norm": 8.534255981445312, + "learning_rate": 3.586084355171051e-05, + "loss": 2.903, + "step": 2729000 + }, + { + "epoch": 0.8485048191778565, + "grad_norm": 7.759302616119385, + "learning_rate": 3.585825301370239e-05, + "loss": 2.9392, + "step": 2729500 + }, + { + "epoch": 0.8486602514583433, + "grad_norm": 11.489712715148926, + "learning_rate": 3.585566247569428e-05, + "loss": 2.889, + "step": 2730000 + }, + { + "epoch": 0.8488156837388302, + "grad_norm": 10.659232139587402, + "learning_rate": 3.5853071937686165e-05, + "loss": 2.9053, + "step": 2730500 + }, + { + "epoch": 0.8489711160193171, + "grad_norm": 22.143953323364258, + "learning_rate": 3.5850481399678046e-05, + "loss": 2.9035, + "step": 2731000 + }, + { + "epoch": 0.849126548299804, + "grad_norm": 19.86590576171875, + "learning_rate": 3.584789086166993e-05, + "loss": 2.8644, + "step": 2731500 + }, + { + "epoch": 0.8492819805802909, + "grad_norm": 7.1596760749816895, + "learning_rate": 3.584530032366182e-05, + "loss": 2.8866, + "step": 2732000 + }, + { + "epoch": 0.8494374128607778, + "grad_norm": 7.973911762237549, + "learning_rate": 3.584270978565371e-05, + "loss": 2.8798, + "step": 2732500 + }, + { + "epoch": 0.8495928451412647, + "grad_norm": 9.437705039978027, + "learning_rate": 3.5840119247645594e-05, + "loss": 2.9217, + "step": 2733000 + }, + { + "epoch": 0.8497482774217515, + "grad_norm": 9.176277160644531, + "learning_rate": 3.5837528709637475e-05, + "loss": 2.8563, + "step": 2733500 + }, + { + "epoch": 0.8499037097022384, + "grad_norm": 7.56258487701416, + "learning_rate": 3.583493817162936e-05, + "loss": 2.8705, + "step": 2734000 + }, + { + "epoch": 0.8500591419827253, + "grad_norm": 11.750716209411621, + "learning_rate": 3.583234763362125e-05, + "loss": 2.8659, + "step": 2734500 + }, + { + "epoch": 0.8502145742632121, + "grad_norm": 8.348365783691406, + "learning_rate": 3.582975709561313e-05, + "loss": 2.8304, + "step": 2735000 + }, + { + "epoch": 0.850370006543699, + "grad_norm": 5.893183708190918, + "learning_rate": 3.5827166557605016e-05, + "loss": 2.9441, + "step": 2735500 + }, + { + "epoch": 0.8505254388241859, + "grad_norm": 11.876937866210938, + "learning_rate": 3.5824576019596904e-05, + "loss": 2.8731, + "step": 2736000 + }, + { + "epoch": 0.8506808711046727, + "grad_norm": 7.17368221282959, + "learning_rate": 3.582198548158879e-05, + "loss": 2.9066, + "step": 2736500 + }, + { + "epoch": 0.8508363033851596, + "grad_norm": 8.691390037536621, + "learning_rate": 3.581939494358068e-05, + "loss": 2.8403, + "step": 2737000 + }, + { + "epoch": 0.8509917356656466, + "grad_norm": 10.021785736083984, + "learning_rate": 3.581680440557256e-05, + "loss": 2.863, + "step": 2737500 + }, + { + "epoch": 0.8511471679461334, + "grad_norm": 14.63551139831543, + "learning_rate": 3.5814213867564445e-05, + "loss": 2.9024, + "step": 2738000 + }, + { + "epoch": 0.8513026002266203, + "grad_norm": 8.082255363464355, + "learning_rate": 3.581162332955633e-05, + "loss": 2.8705, + "step": 2738500 + }, + { + "epoch": 0.8514580325071072, + "grad_norm": 6.266722202301025, + "learning_rate": 3.580903279154821e-05, + "loss": 2.9026, + "step": 2739000 + }, + { + "epoch": 0.851613464787594, + "grad_norm": 6.978199005126953, + "learning_rate": 3.58064422535401e-05, + "loss": 2.8787, + "step": 2739500 + }, + { + "epoch": 0.8517688970680809, + "grad_norm": 6.436660289764404, + "learning_rate": 3.580385171553199e-05, + "loss": 2.8639, + "step": 2740000 + }, + { + "epoch": 0.8519243293485678, + "grad_norm": 9.584094047546387, + "learning_rate": 3.580126117752387e-05, + "loss": 2.8739, + "step": 2740500 + }, + { + "epoch": 0.8520797616290546, + "grad_norm": 10.707369804382324, + "learning_rate": 3.5798670639515755e-05, + "loss": 2.8787, + "step": 2741000 + }, + { + "epoch": 0.8522351939095415, + "grad_norm": 9.91788387298584, + "learning_rate": 3.579608010150764e-05, + "loss": 2.918, + "step": 2741500 + }, + { + "epoch": 0.8523906261900284, + "grad_norm": 9.657798767089844, + "learning_rate": 3.579348956349953e-05, + "loss": 2.8532, + "step": 2742000 + }, + { + "epoch": 0.8525460584705152, + "grad_norm": 19.995868682861328, + "learning_rate": 3.5790899025491416e-05, + "loss": 2.8418, + "step": 2742500 + }, + { + "epoch": 0.8527014907510021, + "grad_norm": 9.997428894042969, + "learning_rate": 3.5788308487483303e-05, + "loss": 2.9092, + "step": 2743000 + }, + { + "epoch": 0.8528569230314891, + "grad_norm": 9.11066722869873, + "learning_rate": 3.5785717949475184e-05, + "loss": 2.9148, + "step": 2743500 + }, + { + "epoch": 0.853012355311976, + "grad_norm": 13.265854835510254, + "learning_rate": 3.578312741146707e-05, + "loss": 2.9553, + "step": 2744000 + }, + { + "epoch": 0.8531677875924628, + "grad_norm": 10.21873664855957, + "learning_rate": 3.578053687345895e-05, + "loss": 2.8446, + "step": 2744500 + }, + { + "epoch": 0.8533232198729497, + "grad_norm": 10.863256454467773, + "learning_rate": 3.577794633545084e-05, + "loss": 2.9246, + "step": 2745000 + }, + { + "epoch": 0.8534786521534365, + "grad_norm": 10.758657455444336, + "learning_rate": 3.5775355797442726e-05, + "loss": 2.8943, + "step": 2745500 + }, + { + "epoch": 0.8536340844339234, + "grad_norm": 9.085600852966309, + "learning_rate": 3.577276525943461e-05, + "loss": 2.9171, + "step": 2746000 + }, + { + "epoch": 0.8537895167144103, + "grad_norm": 7.88003396987915, + "learning_rate": 3.57701747214265e-05, + "loss": 2.8903, + "step": 2746500 + }, + { + "epoch": 0.8539449489948971, + "grad_norm": 8.579583168029785, + "learning_rate": 3.576758418341839e-05, + "loss": 2.8959, + "step": 2747000 + }, + { + "epoch": 0.854100381275384, + "grad_norm": 8.504904747009277, + "learning_rate": 3.576499364541027e-05, + "loss": 2.9089, + "step": 2747500 + }, + { + "epoch": 0.8542558135558709, + "grad_norm": 8.204628944396973, + "learning_rate": 3.5762403107402155e-05, + "loss": 2.9032, + "step": 2748000 + }, + { + "epoch": 0.8544112458363577, + "grad_norm": 10.464776039123535, + "learning_rate": 3.575981256939404e-05, + "loss": 2.9008, + "step": 2748500 + }, + { + "epoch": 0.8545666781168446, + "grad_norm": 8.339519500732422, + "learning_rate": 3.575722203138592e-05, + "loss": 2.8567, + "step": 2749000 + }, + { + "epoch": 0.8547221103973316, + "grad_norm": 9.317358016967773, + "learning_rate": 3.575463149337781e-05, + "loss": 2.9403, + "step": 2749500 + }, + { + "epoch": 0.8548775426778185, + "grad_norm": 6.9258036613464355, + "learning_rate": 3.575204095536969e-05, + "loss": 2.9187, + "step": 2750000 + }, + { + "epoch": 0.8550329749583053, + "grad_norm": 8.011563301086426, + "learning_rate": 3.574945041736158e-05, + "loss": 2.8996, + "step": 2750500 + }, + { + "epoch": 0.8551884072387922, + "grad_norm": 8.450611114501953, + "learning_rate": 3.5746859879353464e-05, + "loss": 2.9032, + "step": 2751000 + }, + { + "epoch": 0.8553438395192791, + "grad_norm": 16.246057510375977, + "learning_rate": 3.574426934134535e-05, + "loss": 2.8658, + "step": 2751500 + }, + { + "epoch": 0.8554992717997659, + "grad_norm": 7.512172222137451, + "learning_rate": 3.574167880333724e-05, + "loss": 2.9103, + "step": 2752000 + }, + { + "epoch": 0.8556547040802528, + "grad_norm": 18.887380599975586, + "learning_rate": 3.5739088265329125e-05, + "loss": 2.8983, + "step": 2752500 + }, + { + "epoch": 0.8558101363607397, + "grad_norm": 9.228525161743164, + "learning_rate": 3.5736497727321006e-05, + "loss": 2.9247, + "step": 2753000 + }, + { + "epoch": 0.8559655686412265, + "grad_norm": 24.08043098449707, + "learning_rate": 3.573390718931289e-05, + "loss": 2.9058, + "step": 2753500 + }, + { + "epoch": 0.8561210009217134, + "grad_norm": 7.774896621704102, + "learning_rate": 3.573131665130478e-05, + "loss": 2.9013, + "step": 2754000 + }, + { + "epoch": 0.8562764332022003, + "grad_norm": 7.347177982330322, + "learning_rate": 3.572872611329666e-05, + "loss": 2.915, + "step": 2754500 + }, + { + "epoch": 0.8564318654826871, + "grad_norm": 7.6357340812683105, + "learning_rate": 3.572613557528855e-05, + "loss": 2.9166, + "step": 2755000 + }, + { + "epoch": 0.8565872977631741, + "grad_norm": 7.5273308753967285, + "learning_rate": 3.572354503728043e-05, + "loss": 2.8854, + "step": 2755500 + }, + { + "epoch": 0.856742730043661, + "grad_norm": 7.993826389312744, + "learning_rate": 3.572095449927232e-05, + "loss": 2.8831, + "step": 2756000 + }, + { + "epoch": 0.8568981623241478, + "grad_norm": 8.282853126525879, + "learning_rate": 3.571836396126421e-05, + "loss": 2.853, + "step": 2756500 + }, + { + "epoch": 0.8570535946046347, + "grad_norm": 8.443086624145508, + "learning_rate": 3.571577342325609e-05, + "loss": 2.9066, + "step": 2757000 + }, + { + "epoch": 0.8572090268851216, + "grad_norm": 10.04550838470459, + "learning_rate": 3.571318288524798e-05, + "loss": 2.8512, + "step": 2757500 + }, + { + "epoch": 0.8573644591656084, + "grad_norm": 13.564813613891602, + "learning_rate": 3.5710592347239864e-05, + "loss": 2.899, + "step": 2758000 + }, + { + "epoch": 0.8575198914460953, + "grad_norm": 15.931013107299805, + "learning_rate": 3.5708001809231744e-05, + "loss": 2.9286, + "step": 2758500 + }, + { + "epoch": 0.8576753237265822, + "grad_norm": 10.768771171569824, + "learning_rate": 3.570541127122363e-05, + "loss": 2.9037, + "step": 2759000 + }, + { + "epoch": 0.857830756007069, + "grad_norm": 7.755194187164307, + "learning_rate": 3.570282073321552e-05, + "loss": 2.9043, + "step": 2759500 + }, + { + "epoch": 0.8579861882875559, + "grad_norm": 8.764568328857422, + "learning_rate": 3.57002301952074e-05, + "loss": 2.924, + "step": 2760000 + }, + { + "epoch": 0.8581416205680428, + "grad_norm": 5.83616304397583, + "learning_rate": 3.5697639657199286e-05, + "loss": 2.9286, + "step": 2760500 + }, + { + "epoch": 0.8582970528485296, + "grad_norm": 12.37662410736084, + "learning_rate": 3.569504911919117e-05, + "loss": 2.8613, + "step": 2761000 + }, + { + "epoch": 0.8584524851290166, + "grad_norm": 9.182377815246582, + "learning_rate": 3.569245858118306e-05, + "loss": 2.8398, + "step": 2761500 + }, + { + "epoch": 0.8586079174095035, + "grad_norm": 8.531879425048828, + "learning_rate": 3.568986804317495e-05, + "loss": 2.9145, + "step": 2762000 + }, + { + "epoch": 0.8587633496899904, + "grad_norm": 11.822637557983398, + "learning_rate": 3.568727750516683e-05, + "loss": 2.8632, + "step": 2762500 + }, + { + "epoch": 0.8589187819704772, + "grad_norm": 9.294897079467773, + "learning_rate": 3.5684686967158715e-05, + "loss": 2.9099, + "step": 2763000 + }, + { + "epoch": 0.8590742142509641, + "grad_norm": 7.236846446990967, + "learning_rate": 3.56820964291506e-05, + "loss": 2.8341, + "step": 2763500 + }, + { + "epoch": 0.859229646531451, + "grad_norm": 7.176408290863037, + "learning_rate": 3.567950589114248e-05, + "loss": 2.8823, + "step": 2764000 + }, + { + "epoch": 0.8593850788119378, + "grad_norm": 10.663305282592773, + "learning_rate": 3.567691535313437e-05, + "loss": 2.8627, + "step": 2764500 + }, + { + "epoch": 0.8595405110924247, + "grad_norm": 8.249204635620117, + "learning_rate": 3.567432481512626e-05, + "loss": 2.8963, + "step": 2765000 + }, + { + "epoch": 0.8596959433729116, + "grad_norm": 9.250527381896973, + "learning_rate": 3.567173427711814e-05, + "loss": 2.8644, + "step": 2765500 + }, + { + "epoch": 0.8598513756533984, + "grad_norm": 8.934734344482422, + "learning_rate": 3.566914373911003e-05, + "loss": 2.8598, + "step": 2766000 + }, + { + "epoch": 0.8600068079338853, + "grad_norm": 32.2108154296875, + "learning_rate": 3.566655320110192e-05, + "loss": 2.9454, + "step": 2766500 + }, + { + "epoch": 0.8601622402143722, + "grad_norm": 10.028505325317383, + "learning_rate": 3.56639626630938e-05, + "loss": 2.894, + "step": 2767000 + }, + { + "epoch": 0.8603176724948591, + "grad_norm": 14.605171203613281, + "learning_rate": 3.5661372125085686e-05, + "loss": 2.9222, + "step": 2767500 + }, + { + "epoch": 0.860473104775346, + "grad_norm": 9.836601257324219, + "learning_rate": 3.5658781587077566e-05, + "loss": 2.9089, + "step": 2768000 + }, + { + "epoch": 0.8606285370558329, + "grad_norm": 6.774215221405029, + "learning_rate": 3.5656191049069453e-05, + "loss": 2.8912, + "step": 2768500 + }, + { + "epoch": 0.8607839693363197, + "grad_norm": 7.702279090881348, + "learning_rate": 3.565360051106134e-05, + "loss": 2.9275, + "step": 2769000 + }, + { + "epoch": 0.8609394016168066, + "grad_norm": 7.830057144165039, + "learning_rate": 3.565100997305322e-05, + "loss": 2.8798, + "step": 2769500 + }, + { + "epoch": 0.8610948338972935, + "grad_norm": 8.850997924804688, + "learning_rate": 3.564841943504511e-05, + "loss": 2.9239, + "step": 2770000 + }, + { + "epoch": 0.8612502661777803, + "grad_norm": 10.234784126281738, + "learning_rate": 3.5645828897036995e-05, + "loss": 2.8748, + "step": 2770500 + }, + { + "epoch": 0.8614056984582672, + "grad_norm": 7.038821697235107, + "learning_rate": 3.564323835902888e-05, + "loss": 2.9123, + "step": 2771000 + }, + { + "epoch": 0.8615611307387541, + "grad_norm": 10.578042030334473, + "learning_rate": 3.564064782102077e-05, + "loss": 2.8975, + "step": 2771500 + }, + { + "epoch": 0.8617165630192409, + "grad_norm": 7.1813154220581055, + "learning_rate": 3.563805728301266e-05, + "loss": 2.9121, + "step": 2772000 + }, + { + "epoch": 0.8618719952997278, + "grad_norm": 9.546144485473633, + "learning_rate": 3.563546674500454e-05, + "loss": 2.9092, + "step": 2772500 + }, + { + "epoch": 0.8620274275802147, + "grad_norm": 8.51128101348877, + "learning_rate": 3.5632876206996424e-05, + "loss": 2.882, + "step": 2773000 + }, + { + "epoch": 0.8621828598607016, + "grad_norm": 9.92890453338623, + "learning_rate": 3.5630285668988305e-05, + "loss": 2.8907, + "step": 2773500 + }, + { + "epoch": 0.8623382921411885, + "grad_norm": 27.517751693725586, + "learning_rate": 3.562769513098019e-05, + "loss": 2.86, + "step": 2774000 + }, + { + "epoch": 0.8624937244216754, + "grad_norm": 7.171834945678711, + "learning_rate": 3.562510459297208e-05, + "loss": 2.877, + "step": 2774500 + }, + { + "epoch": 0.8626491567021622, + "grad_norm": 8.461472511291504, + "learning_rate": 3.562251405496396e-05, + "loss": 2.9269, + "step": 2775000 + }, + { + "epoch": 0.8628045889826491, + "grad_norm": 8.425024032592773, + "learning_rate": 3.5619923516955846e-05, + "loss": 2.8945, + "step": 2775500 + }, + { + "epoch": 0.862960021263136, + "grad_norm": 11.086257934570312, + "learning_rate": 3.561733297894774e-05, + "loss": 2.8422, + "step": 2776000 + }, + { + "epoch": 0.8631154535436228, + "grad_norm": 9.793201446533203, + "learning_rate": 3.561474244093962e-05, + "loss": 2.9326, + "step": 2776500 + }, + { + "epoch": 0.8632708858241097, + "grad_norm": 6.046232223510742, + "learning_rate": 3.561215190293151e-05, + "loss": 2.8838, + "step": 2777000 + }, + { + "epoch": 0.8634263181045966, + "grad_norm": 9.035301208496094, + "learning_rate": 3.5609561364923395e-05, + "loss": 2.8632, + "step": 2777500 + }, + { + "epoch": 0.8635817503850834, + "grad_norm": 10.847846031188965, + "learning_rate": 3.5606970826915275e-05, + "loss": 2.8762, + "step": 2778000 + }, + { + "epoch": 0.8637371826655703, + "grad_norm": 23.954166412353516, + "learning_rate": 3.560438028890716e-05, + "loss": 2.8692, + "step": 2778500 + }, + { + "epoch": 0.8638926149460572, + "grad_norm": 11.018752098083496, + "learning_rate": 3.560178975089905e-05, + "loss": 2.874, + "step": 2779000 + }, + { + "epoch": 0.8640480472265442, + "grad_norm": 9.4205322265625, + "learning_rate": 3.559919921289093e-05, + "loss": 2.8641, + "step": 2779500 + }, + { + "epoch": 0.864203479507031, + "grad_norm": 14.108036994934082, + "learning_rate": 3.559660867488282e-05, + "loss": 2.8918, + "step": 2780000 + }, + { + "epoch": 0.8643589117875179, + "grad_norm": 13.693863868713379, + "learning_rate": 3.5594018136874704e-05, + "loss": 2.8813, + "step": 2780500 + }, + { + "epoch": 0.8645143440680048, + "grad_norm": 8.592875480651855, + "learning_rate": 3.559142759886659e-05, + "loss": 2.8938, + "step": 2781000 + }, + { + "epoch": 0.8646697763484916, + "grad_norm": 7.973937511444092, + "learning_rate": 3.558883706085848e-05, + "loss": 2.8856, + "step": 2781500 + }, + { + "epoch": 0.8648252086289785, + "grad_norm": 6.823058128356934, + "learning_rate": 3.558624652285036e-05, + "loss": 2.9389, + "step": 2782000 + }, + { + "epoch": 0.8649806409094654, + "grad_norm": 6.892411708831787, + "learning_rate": 3.5583655984842246e-05, + "loss": 2.8997, + "step": 2782500 + }, + { + "epoch": 0.8651360731899522, + "grad_norm": 6.818865776062012, + "learning_rate": 3.5581065446834133e-05, + "loss": 2.8453, + "step": 2783000 + }, + { + "epoch": 0.8652915054704391, + "grad_norm": 9.778997421264648, + "learning_rate": 3.5578474908826014e-05, + "loss": 2.9291, + "step": 2783500 + }, + { + "epoch": 0.865446937750926, + "grad_norm": 9.445834159851074, + "learning_rate": 3.55758843708179e-05, + "loss": 2.8971, + "step": 2784000 + }, + { + "epoch": 0.8656023700314128, + "grad_norm": 8.834925651550293, + "learning_rate": 3.557329383280979e-05, + "loss": 2.9299, + "step": 2784500 + }, + { + "epoch": 0.8657578023118997, + "grad_norm": 7.729705333709717, + "learning_rate": 3.557070329480167e-05, + "loss": 2.8493, + "step": 2785000 + }, + { + "epoch": 0.8659132345923867, + "grad_norm": 7.962647438049316, + "learning_rate": 3.5568112756793556e-05, + "loss": 2.909, + "step": 2785500 + }, + { + "epoch": 0.8660686668728735, + "grad_norm": 9.024605751037598, + "learning_rate": 3.556552221878544e-05, + "loss": 2.8983, + "step": 2786000 + }, + { + "epoch": 0.8662240991533604, + "grad_norm": 8.234389305114746, + "learning_rate": 3.556293168077733e-05, + "loss": 2.8912, + "step": 2786500 + }, + { + "epoch": 0.8663795314338473, + "grad_norm": 7.570312976837158, + "learning_rate": 3.556034114276922e-05, + "loss": 2.9208, + "step": 2787000 + }, + { + "epoch": 0.8665349637143341, + "grad_norm": 8.774163246154785, + "learning_rate": 3.55577506047611e-05, + "loss": 2.8992, + "step": 2787500 + }, + { + "epoch": 0.866690395994821, + "grad_norm": 18.262680053710938, + "learning_rate": 3.5555160066752985e-05, + "loss": 2.8753, + "step": 2788000 + }, + { + "epoch": 0.8668458282753079, + "grad_norm": 10.729936599731445, + "learning_rate": 3.555256952874487e-05, + "loss": 2.8776, + "step": 2788500 + }, + { + "epoch": 0.8670012605557947, + "grad_norm": 8.834573745727539, + "learning_rate": 3.554997899073675e-05, + "loss": 2.9098, + "step": 2789000 + }, + { + "epoch": 0.8671566928362816, + "grad_norm": 7.774954319000244, + "learning_rate": 3.554738845272864e-05, + "loss": 2.9309, + "step": 2789500 + }, + { + "epoch": 0.8673121251167685, + "grad_norm": 10.889226913452148, + "learning_rate": 3.5544797914720526e-05, + "loss": 2.8945, + "step": 2790000 + }, + { + "epoch": 0.8674675573972553, + "grad_norm": 14.972281455993652, + "learning_rate": 3.5542207376712414e-05, + "loss": 2.9448, + "step": 2790500 + }, + { + "epoch": 0.8676229896777422, + "grad_norm": 8.42730712890625, + "learning_rate": 3.55396168387043e-05, + "loss": 2.8519, + "step": 2791000 + }, + { + "epoch": 0.8677784219582291, + "grad_norm": 9.29999828338623, + "learning_rate": 3.553702630069619e-05, + "loss": 2.8949, + "step": 2791500 + }, + { + "epoch": 0.867933854238716, + "grad_norm": 7.692764759063721, + "learning_rate": 3.553443576268807e-05, + "loss": 2.861, + "step": 2792000 + }, + { + "epoch": 0.8680892865192029, + "grad_norm": 8.82462215423584, + "learning_rate": 3.5531845224679955e-05, + "loss": 2.8784, + "step": 2792500 + }, + { + "epoch": 0.8682447187996898, + "grad_norm": 8.987279891967773, + "learning_rate": 3.5529254686671836e-05, + "loss": 2.9047, + "step": 2793000 + }, + { + "epoch": 0.8684001510801767, + "grad_norm": 9.06315803527832, + "learning_rate": 3.552666414866372e-05, + "loss": 2.9073, + "step": 2793500 + }, + { + "epoch": 0.8685555833606635, + "grad_norm": 8.802444458007812, + "learning_rate": 3.552407361065561e-05, + "loss": 2.916, + "step": 2794000 + }, + { + "epoch": 0.8687110156411504, + "grad_norm": 22.079092025756836, + "learning_rate": 3.552148307264749e-05, + "loss": 2.8583, + "step": 2794500 + }, + { + "epoch": 0.8688664479216373, + "grad_norm": 8.600926399230957, + "learning_rate": 3.551889253463938e-05, + "loss": 2.9439, + "step": 2795000 + }, + { + "epoch": 0.8690218802021241, + "grad_norm": 9.172992706298828, + "learning_rate": 3.551630199663127e-05, + "loss": 2.8447, + "step": 2795500 + }, + { + "epoch": 0.869177312482611, + "grad_norm": 7.820998191833496, + "learning_rate": 3.551371145862315e-05, + "loss": 2.8617, + "step": 2796000 + }, + { + "epoch": 0.8693327447630979, + "grad_norm": 12.704341888427734, + "learning_rate": 3.551112092061504e-05, + "loss": 2.8821, + "step": 2796500 + }, + { + "epoch": 0.8694881770435847, + "grad_norm": 45.91801071166992, + "learning_rate": 3.5508530382606926e-05, + "loss": 2.8678, + "step": 2797000 + }, + { + "epoch": 0.8696436093240716, + "grad_norm": 14.257137298583984, + "learning_rate": 3.550593984459881e-05, + "loss": 2.9195, + "step": 2797500 + }, + { + "epoch": 0.8697990416045586, + "grad_norm": 19.652820587158203, + "learning_rate": 3.5503349306590694e-05, + "loss": 2.8931, + "step": 2798000 + }, + { + "epoch": 0.8699544738850454, + "grad_norm": 8.11089038848877, + "learning_rate": 3.5500758768582574e-05, + "loss": 2.9088, + "step": 2798500 + }, + { + "epoch": 0.8701099061655323, + "grad_norm": 7.282514572143555, + "learning_rate": 3.549816823057446e-05, + "loss": 2.8563, + "step": 2799000 + }, + { + "epoch": 0.8702653384460192, + "grad_norm": 6.660608768463135, + "learning_rate": 3.549557769256635e-05, + "loss": 2.9037, + "step": 2799500 + }, + { + "epoch": 0.870420770726506, + "grad_norm": 5.656056880950928, + "learning_rate": 3.5492987154558236e-05, + "loss": 2.8662, + "step": 2800000 + }, + { + "epoch": 0.8705762030069929, + "grad_norm": 7.244671821594238, + "learning_rate": 3.549039661655012e-05, + "loss": 2.9048, + "step": 2800500 + }, + { + "epoch": 0.8707316352874798, + "grad_norm": 21.035865783691406, + "learning_rate": 3.548780607854201e-05, + "loss": 2.8994, + "step": 2801000 + }, + { + "epoch": 0.8708870675679666, + "grad_norm": 6.330591201782227, + "learning_rate": 3.548521554053389e-05, + "loss": 2.8553, + "step": 2801500 + }, + { + "epoch": 0.8710424998484535, + "grad_norm": 12.777482032775879, + "learning_rate": 3.548262500252578e-05, + "loss": 2.8737, + "step": 2802000 + }, + { + "epoch": 0.8711979321289404, + "grad_norm": 9.03803539276123, + "learning_rate": 3.5480034464517665e-05, + "loss": 2.8575, + "step": 2802500 + }, + { + "epoch": 0.8713533644094272, + "grad_norm": 9.45576000213623, + "learning_rate": 3.5477443926509545e-05, + "loss": 2.8893, + "step": 2803000 + }, + { + "epoch": 0.8715087966899141, + "grad_norm": 7.648396015167236, + "learning_rate": 3.547485338850143e-05, + "loss": 2.8776, + "step": 2803500 + }, + { + "epoch": 0.8716642289704011, + "grad_norm": 7.476133823394775, + "learning_rate": 3.547226285049331e-05, + "loss": 2.8714, + "step": 2804000 + }, + { + "epoch": 0.871819661250888, + "grad_norm": 9.10485553741455, + "learning_rate": 3.54696723124852e-05, + "loss": 2.8892, + "step": 2804500 + }, + { + "epoch": 0.8719750935313748, + "grad_norm": 9.357739448547363, + "learning_rate": 3.546708177447709e-05, + "loss": 2.8608, + "step": 2805000 + }, + { + "epoch": 0.8721305258118617, + "grad_norm": 11.411481857299805, + "learning_rate": 3.5464491236468974e-05, + "loss": 2.8923, + "step": 2805500 + }, + { + "epoch": 0.8722859580923485, + "grad_norm": 46.191410064697266, + "learning_rate": 3.546190069846086e-05, + "loss": 2.937, + "step": 2806000 + }, + { + "epoch": 0.8724413903728354, + "grad_norm": 6.5958943367004395, + "learning_rate": 3.545931016045275e-05, + "loss": 2.9123, + "step": 2806500 + }, + { + "epoch": 0.8725968226533223, + "grad_norm": 8.211100578308105, + "learning_rate": 3.545671962244463e-05, + "loss": 2.8985, + "step": 2807000 + }, + { + "epoch": 0.8727522549338091, + "grad_norm": 7.7722249031066895, + "learning_rate": 3.5454129084436516e-05, + "loss": 2.8888, + "step": 2807500 + }, + { + "epoch": 0.872907687214296, + "grad_norm": 13.202592849731445, + "learning_rate": 3.54515385464284e-05, + "loss": 2.9146, + "step": 2808000 + }, + { + "epoch": 0.8730631194947829, + "grad_norm": 8.863336563110352, + "learning_rate": 3.544894800842028e-05, + "loss": 2.8966, + "step": 2808500 + }, + { + "epoch": 0.8732185517752697, + "grad_norm": 9.872209548950195, + "learning_rate": 3.544635747041217e-05, + "loss": 2.9225, + "step": 2809000 + }, + { + "epoch": 0.8733739840557566, + "grad_norm": 7.549551486968994, + "learning_rate": 3.544376693240406e-05, + "loss": 2.8914, + "step": 2809500 + }, + { + "epoch": 0.8735294163362436, + "grad_norm": 9.491878509521484, + "learning_rate": 3.5441176394395945e-05, + "loss": 2.8723, + "step": 2810000 + }, + { + "epoch": 0.8736848486167305, + "grad_norm": 8.414908409118652, + "learning_rate": 3.543858585638783e-05, + "loss": 2.8918, + "step": 2810500 + }, + { + "epoch": 0.8738402808972173, + "grad_norm": 9.375036239624023, + "learning_rate": 3.543599531837971e-05, + "loss": 2.8863, + "step": 2811000 + }, + { + "epoch": 0.8739957131777042, + "grad_norm": 9.095402717590332, + "learning_rate": 3.54334047803716e-05, + "loss": 2.9184, + "step": 2811500 + }, + { + "epoch": 0.8741511454581911, + "grad_norm": 8.566633224487305, + "learning_rate": 3.543081424236349e-05, + "loss": 2.9388, + "step": 2812000 + }, + { + "epoch": 0.8743065777386779, + "grad_norm": 5.411419868469238, + "learning_rate": 3.542822370435537e-05, + "loss": 2.9274, + "step": 2812500 + }, + { + "epoch": 0.8744620100191648, + "grad_norm": 10.599237442016602, + "learning_rate": 3.5425633166347254e-05, + "loss": 2.9088, + "step": 2813000 + }, + { + "epoch": 0.8746174422996517, + "grad_norm": 12.239956855773926, + "learning_rate": 3.542304262833914e-05, + "loss": 2.9103, + "step": 2813500 + }, + { + "epoch": 0.8747728745801385, + "grad_norm": 7.171097755432129, + "learning_rate": 3.542045209033102e-05, + "loss": 2.8675, + "step": 2814000 + }, + { + "epoch": 0.8749283068606254, + "grad_norm": 14.945538520812988, + "learning_rate": 3.541786155232291e-05, + "loss": 2.9161, + "step": 2814500 + }, + { + "epoch": 0.8750837391411123, + "grad_norm": 11.368090629577637, + "learning_rate": 3.5415271014314796e-05, + "loss": 2.8948, + "step": 2815000 + }, + { + "epoch": 0.8752391714215991, + "grad_norm": 8.912503242492676, + "learning_rate": 3.541268047630668e-05, + "loss": 2.937, + "step": 2815500 + }, + { + "epoch": 0.8753946037020861, + "grad_norm": 7.688236236572266, + "learning_rate": 3.541008993829857e-05, + "loss": 2.8741, + "step": 2816000 + }, + { + "epoch": 0.875550035982573, + "grad_norm": 9.099151611328125, + "learning_rate": 3.540749940029045e-05, + "loss": 2.9014, + "step": 2816500 + }, + { + "epoch": 0.8757054682630598, + "grad_norm": 9.037640571594238, + "learning_rate": 3.540490886228234e-05, + "loss": 2.932, + "step": 2817000 + }, + { + "epoch": 0.8758609005435467, + "grad_norm": 14.543749809265137, + "learning_rate": 3.5402318324274225e-05, + "loss": 2.8519, + "step": 2817500 + }, + { + "epoch": 0.8760163328240336, + "grad_norm": 47.388160705566406, + "learning_rate": 3.5399727786266105e-05, + "loss": 2.8806, + "step": 2818000 + }, + { + "epoch": 0.8761717651045204, + "grad_norm": 8.333434104919434, + "learning_rate": 3.539713724825799e-05, + "loss": 2.8879, + "step": 2818500 + }, + { + "epoch": 0.8763271973850073, + "grad_norm": 11.564810752868652, + "learning_rate": 3.539454671024988e-05, + "loss": 2.923, + "step": 2819000 + }, + { + "epoch": 0.8764826296654942, + "grad_norm": 35.051239013671875, + "learning_rate": 3.539195617224177e-05, + "loss": 2.8942, + "step": 2819500 + }, + { + "epoch": 0.876638061945981, + "grad_norm": 10.89872932434082, + "learning_rate": 3.5389365634233654e-05, + "loss": 2.9259, + "step": 2820000 + }, + { + "epoch": 0.8767934942264679, + "grad_norm": 7.723567962646484, + "learning_rate": 3.538677509622554e-05, + "loss": 2.9093, + "step": 2820500 + }, + { + "epoch": 0.8769489265069548, + "grad_norm": 7.490896224975586, + "learning_rate": 3.538418455821742e-05, + "loss": 2.879, + "step": 2821000 + }, + { + "epoch": 0.8771043587874416, + "grad_norm": 7.504796981811523, + "learning_rate": 3.538159402020931e-05, + "loss": 2.8704, + "step": 2821500 + }, + { + "epoch": 0.8772597910679286, + "grad_norm": 9.008115768432617, + "learning_rate": 3.537900348220119e-05, + "loss": 2.8706, + "step": 2822000 + }, + { + "epoch": 0.8774152233484155, + "grad_norm": 9.119512557983398, + "learning_rate": 3.5376412944193076e-05, + "loss": 2.936, + "step": 2822500 + }, + { + "epoch": 0.8775706556289024, + "grad_norm": 7.671011924743652, + "learning_rate": 3.5373822406184963e-05, + "loss": 2.8965, + "step": 2823000 + }, + { + "epoch": 0.8777260879093892, + "grad_norm": 8.471144676208496, + "learning_rate": 3.5371231868176844e-05, + "loss": 2.91, + "step": 2823500 + }, + { + "epoch": 0.8778815201898761, + "grad_norm": 8.531580924987793, + "learning_rate": 3.536864133016873e-05, + "loss": 2.8875, + "step": 2824000 + }, + { + "epoch": 0.878036952470363, + "grad_norm": 7.520972728729248, + "learning_rate": 3.536605079216062e-05, + "loss": 2.8552, + "step": 2824500 + }, + { + "epoch": 0.8781923847508498, + "grad_norm": 9.331132888793945, + "learning_rate": 3.5363460254152505e-05, + "loss": 2.8905, + "step": 2825000 + }, + { + "epoch": 0.8783478170313367, + "grad_norm": 48.562538146972656, + "learning_rate": 3.536086971614439e-05, + "loss": 2.8772, + "step": 2825500 + }, + { + "epoch": 0.8785032493118236, + "grad_norm": 8.560100555419922, + "learning_rate": 3.535827917813628e-05, + "loss": 2.8902, + "step": 2826000 + }, + { + "epoch": 0.8786586815923104, + "grad_norm": 8.502840995788574, + "learning_rate": 3.535568864012816e-05, + "loss": 2.9046, + "step": 2826500 + }, + { + "epoch": 0.8788141138727973, + "grad_norm": 8.80019760131836, + "learning_rate": 3.535309810212005e-05, + "loss": 2.8916, + "step": 2827000 + }, + { + "epoch": 0.8789695461532842, + "grad_norm": 20.781497955322266, + "learning_rate": 3.5350507564111934e-05, + "loss": 2.857, + "step": 2827500 + }, + { + "epoch": 0.8791249784337711, + "grad_norm": 7.81084680557251, + "learning_rate": 3.5347917026103815e-05, + "loss": 2.9008, + "step": 2828000 + }, + { + "epoch": 0.879280410714258, + "grad_norm": 9.13222599029541, + "learning_rate": 3.53453264880957e-05, + "loss": 2.9321, + "step": 2828500 + }, + { + "epoch": 0.8794358429947449, + "grad_norm": 16.651365280151367, + "learning_rate": 3.534273595008758e-05, + "loss": 2.9012, + "step": 2829000 + }, + { + "epoch": 0.8795912752752317, + "grad_norm": 10.039477348327637, + "learning_rate": 3.5340145412079476e-05, + "loss": 2.9154, + "step": 2829500 + }, + { + "epoch": 0.8797467075557186, + "grad_norm": 6.118091583251953, + "learning_rate": 3.533755487407136e-05, + "loss": 2.9492, + "step": 2830000 + }, + { + "epoch": 0.8799021398362055, + "grad_norm": 10.643876075744629, + "learning_rate": 3.5334964336063244e-05, + "loss": 2.8774, + "step": 2830500 + }, + { + "epoch": 0.8800575721166923, + "grad_norm": 9.078452110290527, + "learning_rate": 3.533237379805513e-05, + "loss": 2.906, + "step": 2831000 + }, + { + "epoch": 0.8802130043971792, + "grad_norm": 13.837822914123535, + "learning_rate": 3.532978326004702e-05, + "loss": 2.8659, + "step": 2831500 + }, + { + "epoch": 0.8803684366776661, + "grad_norm": 11.493168830871582, + "learning_rate": 3.53271927220389e-05, + "loss": 2.8674, + "step": 2832000 + }, + { + "epoch": 0.8805238689581529, + "grad_norm": 10.223165512084961, + "learning_rate": 3.5324602184030785e-05, + "loss": 2.9001, + "step": 2832500 + }, + { + "epoch": 0.8806793012386398, + "grad_norm": 7.459479808807373, + "learning_rate": 3.532201164602267e-05, + "loss": 2.9252, + "step": 2833000 + }, + { + "epoch": 0.8808347335191267, + "grad_norm": 8.258739471435547, + "learning_rate": 3.531942110801455e-05, + "loss": 2.8545, + "step": 2833500 + }, + { + "epoch": 0.8809901657996136, + "grad_norm": 9.50892448425293, + "learning_rate": 3.531683057000644e-05, + "loss": 2.9307, + "step": 2834000 + }, + { + "epoch": 0.8811455980801005, + "grad_norm": 9.476734161376953, + "learning_rate": 3.531424003199833e-05, + "loss": 2.9105, + "step": 2834500 + }, + { + "epoch": 0.8813010303605874, + "grad_norm": 8.841593742370605, + "learning_rate": 3.5311649493990214e-05, + "loss": 2.8712, + "step": 2835000 + }, + { + "epoch": 0.8814564626410742, + "grad_norm": 10.871893882751465, + "learning_rate": 3.53090589559821e-05, + "loss": 2.9039, + "step": 2835500 + }, + { + "epoch": 0.8816118949215611, + "grad_norm": 25.152111053466797, + "learning_rate": 3.530646841797398e-05, + "loss": 2.8907, + "step": 2836000 + }, + { + "epoch": 0.881767327202048, + "grad_norm": 8.209650993347168, + "learning_rate": 3.530387787996587e-05, + "loss": 2.9197, + "step": 2836500 + }, + { + "epoch": 0.8819227594825348, + "grad_norm": 7.986872673034668, + "learning_rate": 3.5301287341957756e-05, + "loss": 2.8406, + "step": 2837000 + }, + { + "epoch": 0.8820781917630217, + "grad_norm": 5.883378028869629, + "learning_rate": 3.529869680394964e-05, + "loss": 2.8952, + "step": 2837500 + }, + { + "epoch": 0.8822336240435086, + "grad_norm": 9.259705543518066, + "learning_rate": 3.5296106265941524e-05, + "loss": 2.8602, + "step": 2838000 + }, + { + "epoch": 0.8823890563239954, + "grad_norm": 8.250340461730957, + "learning_rate": 3.529351572793341e-05, + "loss": 2.903, + "step": 2838500 + }, + { + "epoch": 0.8825444886044823, + "grad_norm": 10.211949348449707, + "learning_rate": 3.529092518992529e-05, + "loss": 2.9332, + "step": 2839000 + }, + { + "epoch": 0.8826999208849692, + "grad_norm": 11.43673324584961, + "learning_rate": 3.5288334651917185e-05, + "loss": 2.8739, + "step": 2839500 + }, + { + "epoch": 0.8828553531654562, + "grad_norm": 8.74732494354248, + "learning_rate": 3.5285744113909066e-05, + "loss": 2.9033, + "step": 2840000 + }, + { + "epoch": 0.883010785445943, + "grad_norm": 9.252671241760254, + "learning_rate": 3.528315357590095e-05, + "loss": 2.854, + "step": 2840500 + }, + { + "epoch": 0.8831662177264299, + "grad_norm": 10.233160018920898, + "learning_rate": 3.528056303789284e-05, + "loss": 2.9067, + "step": 2841000 + }, + { + "epoch": 0.8833216500069168, + "grad_norm": 6.0126261711120605, + "learning_rate": 3.527797249988472e-05, + "loss": 2.9002, + "step": 2841500 + }, + { + "epoch": 0.8834770822874036, + "grad_norm": 9.654258728027344, + "learning_rate": 3.527538196187661e-05, + "loss": 2.8471, + "step": 2842000 + }, + { + "epoch": 0.8836325145678905, + "grad_norm": 29.954113006591797, + "learning_rate": 3.5272791423868495e-05, + "loss": 2.8829, + "step": 2842500 + }, + { + "epoch": 0.8837879468483774, + "grad_norm": 9.192647933959961, + "learning_rate": 3.5270200885860375e-05, + "loss": 2.9, + "step": 2843000 + }, + { + "epoch": 0.8839433791288642, + "grad_norm": 7.649967670440674, + "learning_rate": 3.526761034785226e-05, + "loss": 2.8786, + "step": 2843500 + }, + { + "epoch": 0.8840988114093511, + "grad_norm": 8.77890396118164, + "learning_rate": 3.526501980984415e-05, + "loss": 2.901, + "step": 2844000 + }, + { + "epoch": 0.884254243689838, + "grad_norm": 10.188101768493652, + "learning_rate": 3.5262429271836036e-05, + "loss": 2.901, + "step": 2844500 + }, + { + "epoch": 0.8844096759703248, + "grad_norm": 9.247663497924805, + "learning_rate": 3.5259838733827924e-05, + "loss": 2.8627, + "step": 2845000 + }, + { + "epoch": 0.8845651082508117, + "grad_norm": 8.360342025756836, + "learning_rate": 3.525724819581981e-05, + "loss": 2.9466, + "step": 2845500 + }, + { + "epoch": 0.8847205405312987, + "grad_norm": 9.132627487182617, + "learning_rate": 3.525465765781169e-05, + "loss": 2.8779, + "step": 2846000 + }, + { + "epoch": 0.8848759728117855, + "grad_norm": 6.747194766998291, + "learning_rate": 3.525206711980358e-05, + "loss": 2.9159, + "step": 2846500 + }, + { + "epoch": 0.8850314050922724, + "grad_norm": 8.49535083770752, + "learning_rate": 3.524947658179546e-05, + "loss": 2.889, + "step": 2847000 + }, + { + "epoch": 0.8851868373727593, + "grad_norm": 10.792284965515137, + "learning_rate": 3.5246886043787346e-05, + "loss": 2.9075, + "step": 2847500 + }, + { + "epoch": 0.8853422696532461, + "grad_norm": 6.6532111167907715, + "learning_rate": 3.524429550577923e-05, + "loss": 2.8645, + "step": 2848000 + }, + { + "epoch": 0.885497701933733, + "grad_norm": 9.126802444458008, + "learning_rate": 3.524170496777111e-05, + "loss": 2.8739, + "step": 2848500 + }, + { + "epoch": 0.8856531342142199, + "grad_norm": 14.427608489990234, + "learning_rate": 3.5239114429763e-05, + "loss": 2.9279, + "step": 2849000 + }, + { + "epoch": 0.8858085664947067, + "grad_norm": 9.869479179382324, + "learning_rate": 3.5236523891754894e-05, + "loss": 2.8973, + "step": 2849500 + }, + { + "epoch": 0.8859639987751936, + "grad_norm": 8.563736915588379, + "learning_rate": 3.5233933353746775e-05, + "loss": 2.9201, + "step": 2850000 + }, + { + "epoch": 0.8861194310556805, + "grad_norm": 8.553022384643555, + "learning_rate": 3.523134281573866e-05, + "loss": 2.855, + "step": 2850500 + }, + { + "epoch": 0.8862748633361673, + "grad_norm": 9.338300704956055, + "learning_rate": 3.522875227773055e-05, + "loss": 2.8839, + "step": 2851000 + }, + { + "epoch": 0.8864302956166542, + "grad_norm": 7.987532138824463, + "learning_rate": 3.522616173972243e-05, + "loss": 2.9042, + "step": 2851500 + }, + { + "epoch": 0.8865857278971412, + "grad_norm": 18.19179916381836, + "learning_rate": 3.522357120171432e-05, + "loss": 2.9047, + "step": 2852000 + }, + { + "epoch": 0.886741160177628, + "grad_norm": 7.489641189575195, + "learning_rate": 3.52209806637062e-05, + "loss": 2.9383, + "step": 2852500 + }, + { + "epoch": 0.8868965924581149, + "grad_norm": 7.27489709854126, + "learning_rate": 3.5218390125698084e-05, + "loss": 2.8869, + "step": 2853000 + }, + { + "epoch": 0.8870520247386018, + "grad_norm": 8.806863784790039, + "learning_rate": 3.521579958768997e-05, + "loss": 2.8777, + "step": 2853500 + }, + { + "epoch": 0.8872074570190887, + "grad_norm": 7.8056321144104, + "learning_rate": 3.521320904968186e-05, + "loss": 2.899, + "step": 2854000 + }, + { + "epoch": 0.8873628892995755, + "grad_norm": 11.442108154296875, + "learning_rate": 3.5210618511673746e-05, + "loss": 2.9059, + "step": 2854500 + }, + { + "epoch": 0.8875183215800624, + "grad_norm": 11.116972923278809, + "learning_rate": 3.520802797366563e-05, + "loss": 2.9446, + "step": 2855000 + }, + { + "epoch": 0.8876737538605493, + "grad_norm": 8.515215873718262, + "learning_rate": 3.520543743565751e-05, + "loss": 2.9006, + "step": 2855500 + }, + { + "epoch": 0.8878291861410361, + "grad_norm": 15.25403118133545, + "learning_rate": 3.52028468976494e-05, + "loss": 2.865, + "step": 2856000 + }, + { + "epoch": 0.887984618421523, + "grad_norm": 11.91422176361084, + "learning_rate": 3.520025635964129e-05, + "loss": 2.8963, + "step": 2856500 + }, + { + "epoch": 0.8881400507020099, + "grad_norm": 7.973917007446289, + "learning_rate": 3.519766582163317e-05, + "loss": 2.8547, + "step": 2857000 + }, + { + "epoch": 0.8882954829824967, + "grad_norm": 8.165667533874512, + "learning_rate": 3.5195075283625055e-05, + "loss": 2.8921, + "step": 2857500 + }, + { + "epoch": 0.8884509152629837, + "grad_norm": 8.58368968963623, + "learning_rate": 3.5192484745616935e-05, + "loss": 2.8651, + "step": 2858000 + }, + { + "epoch": 0.8886063475434706, + "grad_norm": 7.850372791290283, + "learning_rate": 3.518989420760882e-05, + "loss": 2.8938, + "step": 2858500 + }, + { + "epoch": 0.8887617798239574, + "grad_norm": 12.896697044372559, + "learning_rate": 3.518730366960071e-05, + "loss": 2.884, + "step": 2859000 + }, + { + "epoch": 0.8889172121044443, + "grad_norm": 8.550954818725586, + "learning_rate": 3.51847131315926e-05, + "loss": 2.8876, + "step": 2859500 + }, + { + "epoch": 0.8890726443849312, + "grad_norm": 8.978997230529785, + "learning_rate": 3.5182122593584484e-05, + "loss": 2.8736, + "step": 2860000 + }, + { + "epoch": 0.889228076665418, + "grad_norm": 8.699420928955078, + "learning_rate": 3.517953205557637e-05, + "loss": 2.8891, + "step": 2860500 + }, + { + "epoch": 0.8893835089459049, + "grad_norm": 6.563772678375244, + "learning_rate": 3.517694151756825e-05, + "loss": 2.9136, + "step": 2861000 + }, + { + "epoch": 0.8895389412263918, + "grad_norm": 7.9474592208862305, + "learning_rate": 3.517435097956014e-05, + "loss": 2.8463, + "step": 2861500 + }, + { + "epoch": 0.8896943735068786, + "grad_norm": 12.328620910644531, + "learning_rate": 3.5171760441552026e-05, + "loss": 2.8566, + "step": 2862000 + }, + { + "epoch": 0.8898498057873655, + "grad_norm": 6.681273937225342, + "learning_rate": 3.5169169903543906e-05, + "loss": 2.9426, + "step": 2862500 + }, + { + "epoch": 0.8900052380678524, + "grad_norm": 7.52331018447876, + "learning_rate": 3.516657936553579e-05, + "loss": 2.8766, + "step": 2863000 + }, + { + "epoch": 0.8901606703483392, + "grad_norm": 15.497297286987305, + "learning_rate": 3.516398882752768e-05, + "loss": 2.835, + "step": 2863500 + }, + { + "epoch": 0.8903161026288262, + "grad_norm": 8.885714530944824, + "learning_rate": 3.516139828951957e-05, + "loss": 2.8526, + "step": 2864000 + }, + { + "epoch": 0.8904715349093131, + "grad_norm": 8.362709045410156, + "learning_rate": 3.5158807751511455e-05, + "loss": 2.8185, + "step": 2864500 + }, + { + "epoch": 0.8906269671898, + "grad_norm": 7.395949363708496, + "learning_rate": 3.5156217213503335e-05, + "loss": 2.902, + "step": 2865000 + }, + { + "epoch": 0.8907823994702868, + "grad_norm": 10.84183120727539, + "learning_rate": 3.515362667549522e-05, + "loss": 2.8736, + "step": 2865500 + }, + { + "epoch": 0.8909378317507737, + "grad_norm": 9.383894920349121, + "learning_rate": 3.515103613748711e-05, + "loss": 2.8467, + "step": 2866000 + }, + { + "epoch": 0.8910932640312605, + "grad_norm": 8.462385177612305, + "learning_rate": 3.514844559947899e-05, + "loss": 2.8973, + "step": 2866500 + }, + { + "epoch": 0.8912486963117474, + "grad_norm": 9.838400840759277, + "learning_rate": 3.514585506147088e-05, + "loss": 2.9139, + "step": 2867000 + }, + { + "epoch": 0.8914041285922343, + "grad_norm": 10.312861442565918, + "learning_rate": 3.5143264523462764e-05, + "loss": 2.9069, + "step": 2867500 + }, + { + "epoch": 0.8915595608727211, + "grad_norm": 7.829312801361084, + "learning_rate": 3.5140673985454645e-05, + "loss": 2.9065, + "step": 2868000 + }, + { + "epoch": 0.891714993153208, + "grad_norm": 9.119758605957031, + "learning_rate": 3.513808344744653e-05, + "loss": 2.9143, + "step": 2868500 + }, + { + "epoch": 0.8918704254336949, + "grad_norm": 9.953330039978027, + "learning_rate": 3.513549290943842e-05, + "loss": 2.9034, + "step": 2869000 + }, + { + "epoch": 0.8920258577141817, + "grad_norm": 11.495429039001465, + "learning_rate": 3.5132902371430306e-05, + "loss": 2.9015, + "step": 2869500 + }, + { + "epoch": 0.8921812899946687, + "grad_norm": 10.50674819946289, + "learning_rate": 3.513031183342219e-05, + "loss": 2.9032, + "step": 2870000 + }, + { + "epoch": 0.8923367222751556, + "grad_norm": 7.089749813079834, + "learning_rate": 3.5127721295414074e-05, + "loss": 2.8816, + "step": 2870500 + }, + { + "epoch": 0.8924921545556425, + "grad_norm": 10.257668495178223, + "learning_rate": 3.512513075740596e-05, + "loss": 2.8941, + "step": 2871000 + }, + { + "epoch": 0.8926475868361293, + "grad_norm": 10.037687301635742, + "learning_rate": 3.512254021939785e-05, + "loss": 2.954, + "step": 2871500 + }, + { + "epoch": 0.8928030191166162, + "grad_norm": 24.969697952270508, + "learning_rate": 3.511994968138973e-05, + "loss": 2.8645, + "step": 2872000 + }, + { + "epoch": 0.8929584513971031, + "grad_norm": 8.478296279907227, + "learning_rate": 3.5117359143381615e-05, + "loss": 2.9372, + "step": 2872500 + }, + { + "epoch": 0.8931138836775899, + "grad_norm": 8.720327377319336, + "learning_rate": 3.51147686053735e-05, + "loss": 2.9025, + "step": 2873000 + }, + { + "epoch": 0.8932693159580768, + "grad_norm": 6.189871311187744, + "learning_rate": 3.511217806736539e-05, + "loss": 2.8478, + "step": 2873500 + }, + { + "epoch": 0.8934247482385637, + "grad_norm": 9.676880836486816, + "learning_rate": 3.510958752935728e-05, + "loss": 2.8677, + "step": 2874000 + }, + { + "epoch": 0.8935801805190505, + "grad_norm": 9.60492992401123, + "learning_rate": 3.5106996991349164e-05, + "loss": 2.8942, + "step": 2874500 + }, + { + "epoch": 0.8937356127995374, + "grad_norm": 7.768802165985107, + "learning_rate": 3.5104406453341044e-05, + "loss": 2.908, + "step": 2875000 + }, + { + "epoch": 0.8938910450800243, + "grad_norm": 8.968056678771973, + "learning_rate": 3.510181591533293e-05, + "loss": 2.8954, + "step": 2875500 + }, + { + "epoch": 0.8940464773605112, + "grad_norm": 10.174880027770996, + "learning_rate": 3.509922537732481e-05, + "loss": 2.9142, + "step": 2876000 + }, + { + "epoch": 0.8942019096409981, + "grad_norm": 11.562125205993652, + "learning_rate": 3.50966348393167e-05, + "loss": 2.8924, + "step": 2876500 + }, + { + "epoch": 0.894357341921485, + "grad_norm": 11.169544219970703, + "learning_rate": 3.5094044301308586e-05, + "loss": 2.8751, + "step": 2877000 + }, + { + "epoch": 0.8945127742019718, + "grad_norm": 9.15861988067627, + "learning_rate": 3.5091453763300467e-05, + "loss": 2.8729, + "step": 2877500 + }, + { + "epoch": 0.8946682064824587, + "grad_norm": 10.352339744567871, + "learning_rate": 3.5088863225292354e-05, + "loss": 2.9236, + "step": 2878000 + }, + { + "epoch": 0.8948236387629456, + "grad_norm": 5.729245185852051, + "learning_rate": 3.508627268728424e-05, + "loss": 2.8874, + "step": 2878500 + }, + { + "epoch": 0.8949790710434324, + "grad_norm": 5.393176555633545, + "learning_rate": 3.508368214927613e-05, + "loss": 2.8698, + "step": 2879000 + }, + { + "epoch": 0.8951345033239193, + "grad_norm": 8.736350059509277, + "learning_rate": 3.5081091611268015e-05, + "loss": 2.8804, + "step": 2879500 + }, + { + "epoch": 0.8952899356044062, + "grad_norm": 7.553696632385254, + "learning_rate": 3.50785010732599e-05, + "loss": 2.8812, + "step": 2880000 + }, + { + "epoch": 0.895445367884893, + "grad_norm": 36.91799545288086, + "learning_rate": 3.507591053525178e-05, + "loss": 2.9091, + "step": 2880500 + }, + { + "epoch": 0.8956008001653799, + "grad_norm": 10.299405097961426, + "learning_rate": 3.507331999724367e-05, + "loss": 2.8745, + "step": 2881000 + }, + { + "epoch": 0.8957562324458668, + "grad_norm": 9.039993286132812, + "learning_rate": 3.507072945923556e-05, + "loss": 2.8651, + "step": 2881500 + }, + { + "epoch": 0.8959116647263538, + "grad_norm": 8.291670799255371, + "learning_rate": 3.506813892122744e-05, + "loss": 2.9313, + "step": 2882000 + }, + { + "epoch": 0.8960670970068406, + "grad_norm": 9.717188835144043, + "learning_rate": 3.5065548383219325e-05, + "loss": 2.9564, + "step": 2882500 + }, + { + "epoch": 0.8962225292873275, + "grad_norm": 7.799266815185547, + "learning_rate": 3.506295784521121e-05, + "loss": 2.9005, + "step": 2883000 + }, + { + "epoch": 0.8963779615678144, + "grad_norm": 7.771289348602295, + "learning_rate": 3.50603673072031e-05, + "loss": 2.9318, + "step": 2883500 + }, + { + "epoch": 0.8965333938483012, + "grad_norm": 8.420425415039062, + "learning_rate": 3.5057776769194986e-05, + "loss": 2.8458, + "step": 2884000 + }, + { + "epoch": 0.8966888261287881, + "grad_norm": 17.080310821533203, + "learning_rate": 3.5055186231186866e-05, + "loss": 2.8879, + "step": 2884500 + }, + { + "epoch": 0.896844258409275, + "grad_norm": 10.152031898498535, + "learning_rate": 3.5052595693178754e-05, + "loss": 2.8838, + "step": 2885000 + }, + { + "epoch": 0.8969996906897618, + "grad_norm": 12.342680931091309, + "learning_rate": 3.505000515517064e-05, + "loss": 2.9442, + "step": 2885500 + }, + { + "epoch": 0.8971551229702487, + "grad_norm": 8.982125282287598, + "learning_rate": 3.504741461716252e-05, + "loss": 2.917, + "step": 2886000 + }, + { + "epoch": 0.8973105552507356, + "grad_norm": 10.040512084960938, + "learning_rate": 3.504482407915441e-05, + "loss": 2.9009, + "step": 2886500 + }, + { + "epoch": 0.8974659875312224, + "grad_norm": 9.221822738647461, + "learning_rate": 3.5042233541146295e-05, + "loss": 2.885, + "step": 2887000 + }, + { + "epoch": 0.8976214198117093, + "grad_norm": 6.440109729766846, + "learning_rate": 3.5039643003138176e-05, + "loss": 2.8978, + "step": 2887500 + }, + { + "epoch": 0.8977768520921963, + "grad_norm": 9.428201675415039, + "learning_rate": 3.503705246513006e-05, + "loss": 2.8955, + "step": 2888000 + }, + { + "epoch": 0.8979322843726831, + "grad_norm": 8.104732513427734, + "learning_rate": 3.503446192712195e-05, + "loss": 2.8459, + "step": 2888500 + }, + { + "epoch": 0.89808771665317, + "grad_norm": 10.664054870605469, + "learning_rate": 3.503187138911384e-05, + "loss": 2.9025, + "step": 2889000 + }, + { + "epoch": 0.8982431489336569, + "grad_norm": 16.239717483520508, + "learning_rate": 3.5029280851105724e-05, + "loss": 2.8871, + "step": 2889500 + }, + { + "epoch": 0.8983985812141437, + "grad_norm": 8.79964828491211, + "learning_rate": 3.5026690313097605e-05, + "loss": 2.8889, + "step": 2890000 + }, + { + "epoch": 0.8985540134946306, + "grad_norm": 9.251182556152344, + "learning_rate": 3.502409977508949e-05, + "loss": 2.8763, + "step": 2890500 + }, + { + "epoch": 0.8987094457751175, + "grad_norm": 7.778484344482422, + "learning_rate": 3.502150923708138e-05, + "loss": 2.8965, + "step": 2891000 + }, + { + "epoch": 0.8988648780556043, + "grad_norm": 12.276129722595215, + "learning_rate": 3.501891869907326e-05, + "loss": 2.8935, + "step": 2891500 + }, + { + "epoch": 0.8990203103360912, + "grad_norm": 7.966427326202393, + "learning_rate": 3.501632816106515e-05, + "loss": 2.888, + "step": 2892000 + }, + { + "epoch": 0.8991757426165781, + "grad_norm": 8.415345191955566, + "learning_rate": 3.5013737623057034e-05, + "loss": 2.9196, + "step": 2892500 + }, + { + "epoch": 0.8993311748970649, + "grad_norm": 7.642807960510254, + "learning_rate": 3.501114708504892e-05, + "loss": 2.8975, + "step": 2893000 + }, + { + "epoch": 0.8994866071775518, + "grad_norm": 30.611257553100586, + "learning_rate": 3.500855654704081e-05, + "loss": 2.874, + "step": 2893500 + }, + { + "epoch": 0.8996420394580388, + "grad_norm": 8.851075172424316, + "learning_rate": 3.500596600903269e-05, + "loss": 2.9042, + "step": 2894000 + }, + { + "epoch": 0.8997974717385256, + "grad_norm": 9.773266792297363, + "learning_rate": 3.5003375471024576e-05, + "loss": 2.8968, + "step": 2894500 + }, + { + "epoch": 0.8999529040190125, + "grad_norm": 7.583104610443115, + "learning_rate": 3.500078493301646e-05, + "loss": 2.878, + "step": 2895000 + }, + { + "epoch": 0.9001083362994994, + "grad_norm": 8.870342254638672, + "learning_rate": 3.499819439500834e-05, + "loss": 2.8735, + "step": 2895500 + }, + { + "epoch": 0.9002637685799862, + "grad_norm": 5.7834601402282715, + "learning_rate": 3.499560385700023e-05, + "loss": 2.8935, + "step": 2896000 + }, + { + "epoch": 0.9004192008604731, + "grad_norm": 7.208289623260498, + "learning_rate": 3.499301331899212e-05, + "loss": 2.9099, + "step": 2896500 + }, + { + "epoch": 0.90057463314096, + "grad_norm": 8.801483154296875, + "learning_rate": 3.4990422780984e-05, + "loss": 2.9109, + "step": 2897000 + }, + { + "epoch": 0.9007300654214468, + "grad_norm": 15.521273612976074, + "learning_rate": 3.4987832242975885e-05, + "loss": 2.8282, + "step": 2897500 + }, + { + "epoch": 0.9008854977019337, + "grad_norm": 9.511940956115723, + "learning_rate": 3.498524170496777e-05, + "loss": 2.8648, + "step": 2898000 + }, + { + "epoch": 0.9010409299824206, + "grad_norm": 10.679705619812012, + "learning_rate": 3.498265116695966e-05, + "loss": 2.9018, + "step": 2898500 + }, + { + "epoch": 0.9011963622629074, + "grad_norm": 9.0250244140625, + "learning_rate": 3.4980060628951546e-05, + "loss": 2.9129, + "step": 2899000 + }, + { + "epoch": 0.9013517945433943, + "grad_norm": 10.094432830810547, + "learning_rate": 3.4977470090943434e-05, + "loss": 2.8769, + "step": 2899500 + }, + { + "epoch": 0.9015072268238813, + "grad_norm": 22.885971069335938, + "learning_rate": 3.4974879552935314e-05, + "loss": 2.886, + "step": 2900000 + }, + { + "epoch": 0.9016626591043682, + "grad_norm": 7.944474697113037, + "learning_rate": 3.49722890149272e-05, + "loss": 2.876, + "step": 2900500 + }, + { + "epoch": 0.901818091384855, + "grad_norm": 22.460851669311523, + "learning_rate": 3.496969847691908e-05, + "loss": 2.8787, + "step": 2901000 + }, + { + "epoch": 0.9019735236653419, + "grad_norm": 7.11407470703125, + "learning_rate": 3.496710793891097e-05, + "loss": 2.9569, + "step": 2901500 + }, + { + "epoch": 0.9021289559458288, + "grad_norm": 10.216238975524902, + "learning_rate": 3.4964517400902856e-05, + "loss": 2.8418, + "step": 2902000 + }, + { + "epoch": 0.9022843882263156, + "grad_norm": 16.830917358398438, + "learning_rate": 3.4961926862894736e-05, + "loss": 2.8769, + "step": 2902500 + }, + { + "epoch": 0.9024398205068025, + "grad_norm": 9.58922290802002, + "learning_rate": 3.495933632488663e-05, + "loss": 2.8625, + "step": 2903000 + }, + { + "epoch": 0.9025952527872894, + "grad_norm": 7.610659599304199, + "learning_rate": 3.495674578687852e-05, + "loss": 2.8724, + "step": 2903500 + }, + { + "epoch": 0.9027506850677762, + "grad_norm": 11.729060173034668, + "learning_rate": 3.49541552488704e-05, + "loss": 2.9144, + "step": 2904000 + }, + { + "epoch": 0.9029061173482631, + "grad_norm": 8.324459075927734, + "learning_rate": 3.4951564710862285e-05, + "loss": 2.9263, + "step": 2904500 + }, + { + "epoch": 0.90306154962875, + "grad_norm": 6.200403690338135, + "learning_rate": 3.494897417285417e-05, + "loss": 2.9017, + "step": 2905000 + }, + { + "epoch": 0.9032169819092368, + "grad_norm": 11.084380149841309, + "learning_rate": 3.494638363484605e-05, + "loss": 2.8675, + "step": 2905500 + }, + { + "epoch": 0.9033724141897238, + "grad_norm": 10.102869987487793, + "learning_rate": 3.494379309683794e-05, + "loss": 2.8817, + "step": 2906000 + }, + { + "epoch": 0.9035278464702107, + "grad_norm": 7.792688846588135, + "learning_rate": 3.494120255882982e-05, + "loss": 2.9247, + "step": 2906500 + }, + { + "epoch": 0.9036832787506975, + "grad_norm": 9.25859260559082, + "learning_rate": 3.493861202082171e-05, + "loss": 2.8925, + "step": 2907000 + }, + { + "epoch": 0.9038387110311844, + "grad_norm": 9.612520217895508, + "learning_rate": 3.4936021482813594e-05, + "loss": 2.8363, + "step": 2907500 + }, + { + "epoch": 0.9039941433116713, + "grad_norm": 9.884474754333496, + "learning_rate": 3.493343094480548e-05, + "loss": 2.8868, + "step": 2908000 + }, + { + "epoch": 0.9041495755921581, + "grad_norm": 10.5563383102417, + "learning_rate": 3.493084040679737e-05, + "loss": 2.842, + "step": 2908500 + }, + { + "epoch": 0.904305007872645, + "grad_norm": 9.661972999572754, + "learning_rate": 3.4928249868789256e-05, + "loss": 2.8206, + "step": 2909000 + }, + { + "epoch": 0.9044604401531319, + "grad_norm": 8.07446575164795, + "learning_rate": 3.4925659330781136e-05, + "loss": 2.8924, + "step": 2909500 + }, + { + "epoch": 0.9046158724336187, + "grad_norm": 8.438380241394043, + "learning_rate": 3.492306879277302e-05, + "loss": 2.899, + "step": 2910000 + }, + { + "epoch": 0.9047713047141056, + "grad_norm": 9.711723327636719, + "learning_rate": 3.492047825476491e-05, + "loss": 2.891, + "step": 2910500 + }, + { + "epoch": 0.9049267369945925, + "grad_norm": 8.794220924377441, + "learning_rate": 3.491788771675679e-05, + "loss": 2.861, + "step": 2911000 + }, + { + "epoch": 0.9050821692750793, + "grad_norm": 10.919041633605957, + "learning_rate": 3.491529717874868e-05, + "loss": 2.8865, + "step": 2911500 + }, + { + "epoch": 0.9052376015555663, + "grad_norm": 20.567081451416016, + "learning_rate": 3.491270664074056e-05, + "loss": 2.8573, + "step": 2912000 + }, + { + "epoch": 0.9053930338360532, + "grad_norm": 9.150250434875488, + "learning_rate": 3.4910116102732445e-05, + "loss": 2.888, + "step": 2912500 + }, + { + "epoch": 0.90554846611654, + "grad_norm": 10.504456520080566, + "learning_rate": 3.490752556472434e-05, + "loss": 2.9111, + "step": 2913000 + }, + { + "epoch": 0.9057038983970269, + "grad_norm": 9.052949905395508, + "learning_rate": 3.490493502671622e-05, + "loss": 2.8383, + "step": 2913500 + }, + { + "epoch": 0.9058593306775138, + "grad_norm": 9.183926582336426, + "learning_rate": 3.490234448870811e-05, + "loss": 2.8788, + "step": 2914000 + }, + { + "epoch": 0.9060147629580007, + "grad_norm": 8.033653259277344, + "learning_rate": 3.4899753950699994e-05, + "loss": 2.859, + "step": 2914500 + }, + { + "epoch": 0.9061701952384875, + "grad_norm": 9.370647430419922, + "learning_rate": 3.4897163412691874e-05, + "loss": 2.8782, + "step": 2915000 + }, + { + "epoch": 0.9063256275189744, + "grad_norm": 7.438004493713379, + "learning_rate": 3.489457287468376e-05, + "loss": 2.8936, + "step": 2915500 + }, + { + "epoch": 0.9064810597994613, + "grad_norm": 6.665165424346924, + "learning_rate": 3.489198233667565e-05, + "loss": 2.8747, + "step": 2916000 + }, + { + "epoch": 0.9066364920799481, + "grad_norm": 7.546766757965088, + "learning_rate": 3.488939179866753e-05, + "loss": 2.8914, + "step": 2916500 + }, + { + "epoch": 0.906791924360435, + "grad_norm": 12.714120864868164, + "learning_rate": 3.4886801260659416e-05, + "loss": 2.8768, + "step": 2917000 + }, + { + "epoch": 0.9069473566409219, + "grad_norm": 7.314573764801025, + "learning_rate": 3.48842107226513e-05, + "loss": 2.8948, + "step": 2917500 + }, + { + "epoch": 0.9071027889214088, + "grad_norm": 9.716217041015625, + "learning_rate": 3.488162018464319e-05, + "loss": 2.8277, + "step": 2918000 + }, + { + "epoch": 0.9072582212018957, + "grad_norm": 6.939978122711182, + "learning_rate": 3.487902964663508e-05, + "loss": 2.9384, + "step": 2918500 + }, + { + "epoch": 0.9074136534823826, + "grad_norm": 11.746077537536621, + "learning_rate": 3.487643910862696e-05, + "loss": 2.8477, + "step": 2919000 + }, + { + "epoch": 0.9075690857628694, + "grad_norm": 7.843739032745361, + "learning_rate": 3.4873848570618845e-05, + "loss": 2.9387, + "step": 2919500 + }, + { + "epoch": 0.9077245180433563, + "grad_norm": 10.60203742980957, + "learning_rate": 3.487125803261073e-05, + "loss": 2.8597, + "step": 2920000 + }, + { + "epoch": 0.9078799503238432, + "grad_norm": 9.370147705078125, + "learning_rate": 3.486866749460261e-05, + "loss": 2.8951, + "step": 2920500 + }, + { + "epoch": 0.90803538260433, + "grad_norm": 5.684279441833496, + "learning_rate": 3.48660769565945e-05, + "loss": 2.8714, + "step": 2921000 + }, + { + "epoch": 0.9081908148848169, + "grad_norm": 10.159368515014648, + "learning_rate": 3.486348641858639e-05, + "loss": 2.8114, + "step": 2921500 + }, + { + "epoch": 0.9083462471653038, + "grad_norm": 8.778696060180664, + "learning_rate": 3.486089588057827e-05, + "loss": 2.9471, + "step": 2922000 + }, + { + "epoch": 0.9085016794457906, + "grad_norm": 7.231593608856201, + "learning_rate": 3.4858305342570155e-05, + "loss": 2.8923, + "step": 2922500 + }, + { + "epoch": 0.9086571117262775, + "grad_norm": 11.16109848022461, + "learning_rate": 3.485571480456205e-05, + "loss": 2.8666, + "step": 2923000 + }, + { + "epoch": 0.9088125440067644, + "grad_norm": 7.403845310211182, + "learning_rate": 3.485312426655393e-05, + "loss": 2.8813, + "step": 2923500 + }, + { + "epoch": 0.9089679762872512, + "grad_norm": 10.097569465637207, + "learning_rate": 3.4850533728545816e-05, + "loss": 2.8648, + "step": 2924000 + }, + { + "epoch": 0.9091234085677382, + "grad_norm": 8.096266746520996, + "learning_rate": 3.4847943190537696e-05, + "loss": 2.8582, + "step": 2924500 + }, + { + "epoch": 0.9092788408482251, + "grad_norm": 8.050078392028809, + "learning_rate": 3.4845352652529584e-05, + "loss": 2.8846, + "step": 2925000 + }, + { + "epoch": 0.909434273128712, + "grad_norm": 11.543882369995117, + "learning_rate": 3.484276211452147e-05, + "loss": 2.8816, + "step": 2925500 + }, + { + "epoch": 0.9095897054091988, + "grad_norm": 8.598169326782227, + "learning_rate": 3.484017157651335e-05, + "loss": 2.9006, + "step": 2926000 + }, + { + "epoch": 0.9097451376896857, + "grad_norm": 14.267986297607422, + "learning_rate": 3.483758103850524e-05, + "loss": 2.9005, + "step": 2926500 + }, + { + "epoch": 0.9099005699701725, + "grad_norm": 9.259952545166016, + "learning_rate": 3.4834990500497125e-05, + "loss": 2.887, + "step": 2927000 + }, + { + "epoch": 0.9100560022506594, + "grad_norm": 10.009737014770508, + "learning_rate": 3.483239996248901e-05, + "loss": 2.8734, + "step": 2927500 + }, + { + "epoch": 0.9102114345311463, + "grad_norm": 7.470198154449463, + "learning_rate": 3.48298094244809e-05, + "loss": 2.9335, + "step": 2928000 + }, + { + "epoch": 0.9103668668116331, + "grad_norm": 7.507563591003418, + "learning_rate": 3.482721888647279e-05, + "loss": 2.9114, + "step": 2928500 + }, + { + "epoch": 0.91052229909212, + "grad_norm": 7.307322978973389, + "learning_rate": 3.482462834846467e-05, + "loss": 2.9182, + "step": 2929000 + }, + { + "epoch": 0.9106777313726069, + "grad_norm": 8.51759147644043, + "learning_rate": 3.4822037810456554e-05, + "loss": 2.8534, + "step": 2929500 + }, + { + "epoch": 0.9108331636530937, + "grad_norm": 8.16244888305664, + "learning_rate": 3.481944727244844e-05, + "loss": 2.847, + "step": 2930000 + }, + { + "epoch": 0.9109885959335807, + "grad_norm": 8.89371109008789, + "learning_rate": 3.481685673444032e-05, + "loss": 2.8499, + "step": 2930500 + }, + { + "epoch": 0.9111440282140676, + "grad_norm": 43.91466522216797, + "learning_rate": 3.481426619643221e-05, + "loss": 2.9178, + "step": 2931000 + }, + { + "epoch": 0.9112994604945545, + "grad_norm": 9.332555770874023, + "learning_rate": 3.481167565842409e-05, + "loss": 2.8685, + "step": 2931500 + }, + { + "epoch": 0.9114548927750413, + "grad_norm": 8.039565086364746, + "learning_rate": 3.4809085120415977e-05, + "loss": 2.879, + "step": 2932000 + }, + { + "epoch": 0.9116103250555282, + "grad_norm": 13.274303436279297, + "learning_rate": 3.4806494582407864e-05, + "loss": 2.8717, + "step": 2932500 + }, + { + "epoch": 0.9117657573360151, + "grad_norm": 8.216110229492188, + "learning_rate": 3.480390404439975e-05, + "loss": 2.8722, + "step": 2933000 + }, + { + "epoch": 0.9119211896165019, + "grad_norm": 13.330166816711426, + "learning_rate": 3.480131350639164e-05, + "loss": 2.8922, + "step": 2933500 + }, + { + "epoch": 0.9120766218969888, + "grad_norm": 7.750061988830566, + "learning_rate": 3.4798722968383525e-05, + "loss": 2.8815, + "step": 2934000 + }, + { + "epoch": 0.9122320541774757, + "grad_norm": 10.204545974731445, + "learning_rate": 3.4796132430375406e-05, + "loss": 2.8631, + "step": 2934500 + }, + { + "epoch": 0.9123874864579625, + "grad_norm": 22.947439193725586, + "learning_rate": 3.479354189236729e-05, + "loss": 2.9041, + "step": 2935000 + }, + { + "epoch": 0.9125429187384494, + "grad_norm": 14.126570701599121, + "learning_rate": 3.479095135435918e-05, + "loss": 2.8681, + "step": 2935500 + }, + { + "epoch": 0.9126983510189363, + "grad_norm": 6.628986358642578, + "learning_rate": 3.478836081635106e-05, + "loss": 2.9171, + "step": 2936000 + }, + { + "epoch": 0.9128537832994232, + "grad_norm": 8.619221687316895, + "learning_rate": 3.478577027834295e-05, + "loss": 2.8936, + "step": 2936500 + }, + { + "epoch": 0.9130092155799101, + "grad_norm": 9.900786399841309, + "learning_rate": 3.4783179740334835e-05, + "loss": 2.9119, + "step": 2937000 + }, + { + "epoch": 0.913164647860397, + "grad_norm": 13.058842658996582, + "learning_rate": 3.478058920232672e-05, + "loss": 2.8991, + "step": 2937500 + }, + { + "epoch": 0.9133200801408838, + "grad_norm": 8.819350242614746, + "learning_rate": 3.477799866431861e-05, + "loss": 2.844, + "step": 2938000 + }, + { + "epoch": 0.9134755124213707, + "grad_norm": 6.867435932159424, + "learning_rate": 3.477540812631049e-05, + "loss": 2.8464, + "step": 2938500 + }, + { + "epoch": 0.9136309447018576, + "grad_norm": 5.259130001068115, + "learning_rate": 3.4772817588302376e-05, + "loss": 2.8724, + "step": 2939000 + }, + { + "epoch": 0.9137863769823444, + "grad_norm": 7.597060203552246, + "learning_rate": 3.4770227050294264e-05, + "loss": 2.8758, + "step": 2939500 + }, + { + "epoch": 0.9139418092628313, + "grad_norm": 8.111102104187012, + "learning_rate": 3.4767636512286144e-05, + "loss": 2.8719, + "step": 2940000 + }, + { + "epoch": 0.9140972415433182, + "grad_norm": 9.188836097717285, + "learning_rate": 3.476504597427803e-05, + "loss": 2.8657, + "step": 2940500 + }, + { + "epoch": 0.914252673823805, + "grad_norm": 7.28554630279541, + "learning_rate": 3.476245543626992e-05, + "loss": 2.9251, + "step": 2941000 + }, + { + "epoch": 0.9144081061042919, + "grad_norm": 9.238210678100586, + "learning_rate": 3.47598648982618e-05, + "loss": 2.8681, + "step": 2941500 + }, + { + "epoch": 0.9145635383847788, + "grad_norm": 8.248133659362793, + "learning_rate": 3.4757274360253686e-05, + "loss": 2.8944, + "step": 2942000 + }, + { + "epoch": 0.9147189706652658, + "grad_norm": 8.175444602966309, + "learning_rate": 3.475468382224557e-05, + "loss": 2.896, + "step": 2942500 + }, + { + "epoch": 0.9148744029457526, + "grad_norm": 9.868369102478027, + "learning_rate": 3.475209328423746e-05, + "loss": 2.8857, + "step": 2943000 + }, + { + "epoch": 0.9150298352262395, + "grad_norm": 8.472128868103027, + "learning_rate": 3.474950274622935e-05, + "loss": 2.8525, + "step": 2943500 + }, + { + "epoch": 0.9151852675067264, + "grad_norm": 9.122458457946777, + "learning_rate": 3.474691220822123e-05, + "loss": 2.9035, + "step": 2944000 + }, + { + "epoch": 0.9153406997872132, + "grad_norm": 9.433444023132324, + "learning_rate": 3.4744321670213115e-05, + "loss": 2.8605, + "step": 2944500 + }, + { + "epoch": 0.9154961320677001, + "grad_norm": 11.997209548950195, + "learning_rate": 3.4741731132205e-05, + "loss": 2.8907, + "step": 2945000 + }, + { + "epoch": 0.915651564348187, + "grad_norm": 8.778188705444336, + "learning_rate": 3.473914059419688e-05, + "loss": 2.8508, + "step": 2945500 + }, + { + "epoch": 0.9158069966286738, + "grad_norm": 9.803244590759277, + "learning_rate": 3.473655005618877e-05, + "loss": 2.8841, + "step": 2946000 + }, + { + "epoch": 0.9159624289091607, + "grad_norm": 9.89633846282959, + "learning_rate": 3.4733959518180657e-05, + "loss": 2.8866, + "step": 2946500 + }, + { + "epoch": 0.9161178611896476, + "grad_norm": 8.492581367492676, + "learning_rate": 3.4731368980172544e-05, + "loss": 2.8706, + "step": 2947000 + }, + { + "epoch": 0.9162732934701344, + "grad_norm": 8.984375, + "learning_rate": 3.472877844216443e-05, + "loss": 2.8349, + "step": 2947500 + }, + { + "epoch": 0.9164287257506213, + "grad_norm": 7.694403171539307, + "learning_rate": 3.472618790415632e-05, + "loss": 2.9513, + "step": 2948000 + }, + { + "epoch": 0.9165841580311083, + "grad_norm": 8.735879898071289, + "learning_rate": 3.47235973661482e-05, + "loss": 2.8993, + "step": 2948500 + }, + { + "epoch": 0.9167395903115951, + "grad_norm": 6.792108058929443, + "learning_rate": 3.4721006828140086e-05, + "loss": 2.8819, + "step": 2949000 + }, + { + "epoch": 0.916895022592082, + "grad_norm": 8.361845016479492, + "learning_rate": 3.4718416290131966e-05, + "loss": 2.8665, + "step": 2949500 + }, + { + "epoch": 0.9170504548725689, + "grad_norm": 8.33590030670166, + "learning_rate": 3.471582575212385e-05, + "loss": 2.8711, + "step": 2950000 + }, + { + "epoch": 0.9172058871530557, + "grad_norm": 15.135053634643555, + "learning_rate": 3.471323521411574e-05, + "loss": 2.8883, + "step": 2950500 + }, + { + "epoch": 0.9173613194335426, + "grad_norm": 9.532010078430176, + "learning_rate": 3.471064467610762e-05, + "loss": 2.9184, + "step": 2951000 + }, + { + "epoch": 0.9175167517140295, + "grad_norm": 12.256246566772461, + "learning_rate": 3.470805413809951e-05, + "loss": 2.8821, + "step": 2951500 + }, + { + "epoch": 0.9176721839945163, + "grad_norm": 11.030381202697754, + "learning_rate": 3.4705463600091395e-05, + "loss": 2.9201, + "step": 2952000 + }, + { + "epoch": 0.9178276162750032, + "grad_norm": 9.004386901855469, + "learning_rate": 3.470287306208328e-05, + "loss": 2.8901, + "step": 2952500 + }, + { + "epoch": 0.9179830485554901, + "grad_norm": 8.282744407653809, + "learning_rate": 3.470028252407517e-05, + "loss": 2.8747, + "step": 2953000 + }, + { + "epoch": 0.9181384808359769, + "grad_norm": 9.470744132995605, + "learning_rate": 3.4697691986067056e-05, + "loss": 2.8928, + "step": 2953500 + }, + { + "epoch": 0.9182939131164638, + "grad_norm": 8.56399917602539, + "learning_rate": 3.469510144805894e-05, + "loss": 2.9162, + "step": 2954000 + }, + { + "epoch": 0.9184493453969508, + "grad_norm": 8.315070152282715, + "learning_rate": 3.4692510910050824e-05, + "loss": 2.8977, + "step": 2954500 + }, + { + "epoch": 0.9186047776774376, + "grad_norm": 7.83361291885376, + "learning_rate": 3.4689920372042704e-05, + "loss": 2.9017, + "step": 2955000 + }, + { + "epoch": 0.9187602099579245, + "grad_norm": 7.9420857429504395, + "learning_rate": 3.468732983403459e-05, + "loss": 2.8575, + "step": 2955500 + }, + { + "epoch": 0.9189156422384114, + "grad_norm": 10.005561828613281, + "learning_rate": 3.468473929602648e-05, + "loss": 2.8484, + "step": 2956000 + }, + { + "epoch": 0.9190710745188982, + "grad_norm": 9.021201133728027, + "learning_rate": 3.468214875801836e-05, + "loss": 2.9227, + "step": 2956500 + }, + { + "epoch": 0.9192265067993851, + "grad_norm": 10.86722183227539, + "learning_rate": 3.467955822001025e-05, + "loss": 2.8926, + "step": 2957000 + }, + { + "epoch": 0.919381939079872, + "grad_norm": 43.8709602355957, + "learning_rate": 3.467696768200214e-05, + "loss": 2.8726, + "step": 2957500 + }, + { + "epoch": 0.9195373713603588, + "grad_norm": 7.997913360595703, + "learning_rate": 3.467437714399402e-05, + "loss": 2.833, + "step": 2958000 + }, + { + "epoch": 0.9196928036408457, + "grad_norm": 10.042380332946777, + "learning_rate": 3.467178660598591e-05, + "loss": 2.9512, + "step": 2958500 + }, + { + "epoch": 0.9198482359213326, + "grad_norm": 10.850156784057617, + "learning_rate": 3.4669196067977795e-05, + "loss": 2.8947, + "step": 2959000 + }, + { + "epoch": 0.9200036682018194, + "grad_norm": 9.191359519958496, + "learning_rate": 3.4666605529969675e-05, + "loss": 2.8746, + "step": 2959500 + }, + { + "epoch": 0.9201591004823063, + "grad_norm": 10.557916641235352, + "learning_rate": 3.466401499196156e-05, + "loss": 2.9082, + "step": 2960000 + }, + { + "epoch": 0.9203145327627933, + "grad_norm": 8.118138313293457, + "learning_rate": 3.466142445395344e-05, + "loss": 2.8719, + "step": 2960500 + }, + { + "epoch": 0.9204699650432802, + "grad_norm": 8.130017280578613, + "learning_rate": 3.465883391594533e-05, + "loss": 2.867, + "step": 2961000 + }, + { + "epoch": 0.920625397323767, + "grad_norm": 41.338233947753906, + "learning_rate": 3.465624337793722e-05, + "loss": 2.8515, + "step": 2961500 + }, + { + "epoch": 0.9207808296042539, + "grad_norm": 8.503790855407715, + "learning_rate": 3.4653652839929104e-05, + "loss": 2.8802, + "step": 2962000 + }, + { + "epoch": 0.9209362618847408, + "grad_norm": 8.588586807250977, + "learning_rate": 3.465106230192099e-05, + "loss": 2.8499, + "step": 2962500 + }, + { + "epoch": 0.9210916941652276, + "grad_norm": 9.562521934509277, + "learning_rate": 3.464847176391288e-05, + "loss": 2.8931, + "step": 2963000 + }, + { + "epoch": 0.9212471264457145, + "grad_norm": 6.921484470367432, + "learning_rate": 3.464588122590476e-05, + "loss": 2.8732, + "step": 2963500 + }, + { + "epoch": 0.9214025587262014, + "grad_norm": 7.5541090965271, + "learning_rate": 3.4643290687896646e-05, + "loss": 2.9208, + "step": 2964000 + }, + { + "epoch": 0.9215579910066882, + "grad_norm": 9.420157432556152, + "learning_rate": 3.464070014988853e-05, + "loss": 2.914, + "step": 2964500 + }, + { + "epoch": 0.9217134232871751, + "grad_norm": 11.680061340332031, + "learning_rate": 3.4638109611880414e-05, + "loss": 2.9237, + "step": 2965000 + }, + { + "epoch": 0.921868855567662, + "grad_norm": 11.677590370178223, + "learning_rate": 3.46355190738723e-05, + "loss": 2.8636, + "step": 2965500 + }, + { + "epoch": 0.9220242878481488, + "grad_norm": 9.113822937011719, + "learning_rate": 3.463292853586419e-05, + "loss": 2.8697, + "step": 2966000 + }, + { + "epoch": 0.9221797201286358, + "grad_norm": 9.800934791564941, + "learning_rate": 3.463033799785607e-05, + "loss": 2.8936, + "step": 2966500 + }, + { + "epoch": 0.9223351524091227, + "grad_norm": 10.070195198059082, + "learning_rate": 3.462774745984796e-05, + "loss": 2.855, + "step": 2967000 + }, + { + "epoch": 0.9224905846896095, + "grad_norm": 8.172876358032227, + "learning_rate": 3.462515692183984e-05, + "loss": 2.9039, + "step": 2967500 + }, + { + "epoch": 0.9226460169700964, + "grad_norm": 8.111671447753906, + "learning_rate": 3.462256638383173e-05, + "loss": 2.9052, + "step": 2968000 + }, + { + "epoch": 0.9228014492505833, + "grad_norm": 7.662586688995361, + "learning_rate": 3.461997584582362e-05, + "loss": 2.8793, + "step": 2968500 + }, + { + "epoch": 0.9229568815310701, + "grad_norm": 17.222389221191406, + "learning_rate": 3.46173853078155e-05, + "loss": 2.9077, + "step": 2969000 + }, + { + "epoch": 0.923112313811557, + "grad_norm": 8.271260261535645, + "learning_rate": 3.4614794769807384e-05, + "loss": 2.9526, + "step": 2969500 + }, + { + "epoch": 0.9232677460920439, + "grad_norm": 8.696680068969727, + "learning_rate": 3.461220423179927e-05, + "loss": 2.8643, + "step": 2970000 + }, + { + "epoch": 0.9234231783725307, + "grad_norm": 9.14870834350586, + "learning_rate": 3.460961369379115e-05, + "loss": 2.8946, + "step": 2970500 + }, + { + "epoch": 0.9235786106530176, + "grad_norm": 16.417463302612305, + "learning_rate": 3.460702315578304e-05, + "loss": 2.8954, + "step": 2971000 + }, + { + "epoch": 0.9237340429335045, + "grad_norm": 31.585216522216797, + "learning_rate": 3.4604432617774926e-05, + "loss": 2.8801, + "step": 2971500 + }, + { + "epoch": 0.9238894752139913, + "grad_norm": 11.411246299743652, + "learning_rate": 3.460184207976681e-05, + "loss": 2.8769, + "step": 2972000 + }, + { + "epoch": 0.9240449074944783, + "grad_norm": 8.490752220153809, + "learning_rate": 3.45992515417587e-05, + "loss": 2.896, + "step": 2972500 + }, + { + "epoch": 0.9242003397749652, + "grad_norm": 9.013860702514648, + "learning_rate": 3.459666100375058e-05, + "loss": 2.8077, + "step": 2973000 + }, + { + "epoch": 0.924355772055452, + "grad_norm": 7.1518144607543945, + "learning_rate": 3.459407046574247e-05, + "loss": 2.8557, + "step": 2973500 + }, + { + "epoch": 0.9245112043359389, + "grad_norm": 8.155781745910645, + "learning_rate": 3.4591479927734355e-05, + "loss": 2.8869, + "step": 2974000 + }, + { + "epoch": 0.9246666366164258, + "grad_norm": 25.509084701538086, + "learning_rate": 3.4588889389726236e-05, + "loss": 2.8643, + "step": 2974500 + }, + { + "epoch": 0.9248220688969127, + "grad_norm": 10.995949745178223, + "learning_rate": 3.458629885171812e-05, + "loss": 2.8844, + "step": 2975000 + }, + { + "epoch": 0.9249775011773995, + "grad_norm": 10.737728118896484, + "learning_rate": 3.458370831371001e-05, + "loss": 2.9157, + "step": 2975500 + }, + { + "epoch": 0.9251329334578864, + "grad_norm": 9.563267707824707, + "learning_rate": 3.458111777570189e-05, + "loss": 2.8711, + "step": 2976000 + }, + { + "epoch": 0.9252883657383733, + "grad_norm": 8.522640228271484, + "learning_rate": 3.4578527237693784e-05, + "loss": 2.8932, + "step": 2976500 + }, + { + "epoch": 0.9254437980188601, + "grad_norm": 10.865288734436035, + "learning_rate": 3.457593669968567e-05, + "loss": 2.8744, + "step": 2977000 + }, + { + "epoch": 0.925599230299347, + "grad_norm": 12.544992446899414, + "learning_rate": 3.457334616167755e-05, + "loss": 2.8941, + "step": 2977500 + }, + { + "epoch": 0.9257546625798339, + "grad_norm": 8.171772003173828, + "learning_rate": 3.457075562366944e-05, + "loss": 2.887, + "step": 2978000 + }, + { + "epoch": 0.9259100948603208, + "grad_norm": 7.64500093460083, + "learning_rate": 3.456816508566132e-05, + "loss": 2.8401, + "step": 2978500 + }, + { + "epoch": 0.9260655271408077, + "grad_norm": 7.428579807281494, + "learning_rate": 3.4565574547653206e-05, + "loss": 2.8992, + "step": 2979000 + }, + { + "epoch": 0.9262209594212946, + "grad_norm": 9.515082359313965, + "learning_rate": 3.4562984009645094e-05, + "loss": 2.9297, + "step": 2979500 + }, + { + "epoch": 0.9263763917017814, + "grad_norm": 11.040670394897461, + "learning_rate": 3.4560393471636974e-05, + "loss": 2.8798, + "step": 2980000 + }, + { + "epoch": 0.9265318239822683, + "grad_norm": 12.593046188354492, + "learning_rate": 3.455780293362886e-05, + "loss": 2.8708, + "step": 2980500 + }, + { + "epoch": 0.9266872562627552, + "grad_norm": 7.900697231292725, + "learning_rate": 3.455521239562075e-05, + "loss": 2.9034, + "step": 2981000 + }, + { + "epoch": 0.926842688543242, + "grad_norm": 8.205558776855469, + "learning_rate": 3.4552621857612635e-05, + "loss": 2.8876, + "step": 2981500 + }, + { + "epoch": 0.9269981208237289, + "grad_norm": 11.605668067932129, + "learning_rate": 3.455003131960452e-05, + "loss": 2.8811, + "step": 2982000 + }, + { + "epoch": 0.9271535531042158, + "grad_norm": 7.3720879554748535, + "learning_rate": 3.454744078159641e-05, + "loss": 2.864, + "step": 2982500 + }, + { + "epoch": 0.9273089853847026, + "grad_norm": 9.153728485107422, + "learning_rate": 3.454485024358829e-05, + "loss": 2.9131, + "step": 2983000 + }, + { + "epoch": 0.9274644176651895, + "grad_norm": 10.498189926147461, + "learning_rate": 3.454225970558018e-05, + "loss": 2.9111, + "step": 2983500 + }, + { + "epoch": 0.9276198499456764, + "grad_norm": 12.558444023132324, + "learning_rate": 3.4539669167572064e-05, + "loss": 2.8667, + "step": 2984000 + }, + { + "epoch": 0.9277752822261633, + "grad_norm": 8.245563507080078, + "learning_rate": 3.4537078629563945e-05, + "loss": 2.882, + "step": 2984500 + }, + { + "epoch": 0.9279307145066502, + "grad_norm": 8.270671844482422, + "learning_rate": 3.453448809155583e-05, + "loss": 2.8934, + "step": 2985000 + }, + { + "epoch": 0.9280861467871371, + "grad_norm": 8.438468933105469, + "learning_rate": 3.453189755354771e-05, + "loss": 2.9276, + "step": 2985500 + }, + { + "epoch": 0.928241579067624, + "grad_norm": 9.677546501159668, + "learning_rate": 3.45293070155396e-05, + "loss": 2.8757, + "step": 2986000 + }, + { + "epoch": 0.9283970113481108, + "grad_norm": 7.591219902038574, + "learning_rate": 3.452671647753149e-05, + "loss": 2.9037, + "step": 2986500 + }, + { + "epoch": 0.9285524436285977, + "grad_norm": 5.107989311218262, + "learning_rate": 3.4524125939523374e-05, + "loss": 2.8956, + "step": 2987000 + }, + { + "epoch": 0.9287078759090845, + "grad_norm": 12.642374992370605, + "learning_rate": 3.452153540151526e-05, + "loss": 2.8695, + "step": 2987500 + }, + { + "epoch": 0.9288633081895714, + "grad_norm": 6.696290016174316, + "learning_rate": 3.451894486350715e-05, + "loss": 2.8952, + "step": 2988000 + }, + { + "epoch": 0.9290187404700583, + "grad_norm": 8.955946922302246, + "learning_rate": 3.451635432549903e-05, + "loss": 2.9243, + "step": 2988500 + }, + { + "epoch": 0.9291741727505451, + "grad_norm": 5.683178424835205, + "learning_rate": 3.4513763787490916e-05, + "loss": 2.8663, + "step": 2989000 + }, + { + "epoch": 0.929329605031032, + "grad_norm": 8.353137969970703, + "learning_rate": 3.45111732494828e-05, + "loss": 2.9069, + "step": 2989500 + }, + { + "epoch": 0.9294850373115189, + "grad_norm": 9.614656448364258, + "learning_rate": 3.450858271147468e-05, + "loss": 2.8829, + "step": 2990000 + }, + { + "epoch": 0.9296404695920059, + "grad_norm": 7.8414106369018555, + "learning_rate": 3.450599217346657e-05, + "loss": 2.8959, + "step": 2990500 + }, + { + "epoch": 0.9297959018724927, + "grad_norm": 7.818763256072998, + "learning_rate": 3.450340163545846e-05, + "loss": 2.8937, + "step": 2991000 + }, + { + "epoch": 0.9299513341529796, + "grad_norm": 10.307514190673828, + "learning_rate": 3.4500811097450345e-05, + "loss": 2.8671, + "step": 2991500 + }, + { + "epoch": 0.9301067664334665, + "grad_norm": 20.40949821472168, + "learning_rate": 3.449822055944223e-05, + "loss": 2.8539, + "step": 2992000 + }, + { + "epoch": 0.9302621987139533, + "grad_norm": 11.547018051147461, + "learning_rate": 3.449563002143411e-05, + "loss": 2.8703, + "step": 2992500 + }, + { + "epoch": 0.9304176309944402, + "grad_norm": 9.128541946411133, + "learning_rate": 3.4493039483426e-05, + "loss": 2.8768, + "step": 2993000 + }, + { + "epoch": 0.9305730632749271, + "grad_norm": 9.301417350769043, + "learning_rate": 3.4490448945417886e-05, + "loss": 2.8823, + "step": 2993500 + }, + { + "epoch": 0.9307284955554139, + "grad_norm": 9.156929969787598, + "learning_rate": 3.448785840740977e-05, + "loss": 2.8874, + "step": 2994000 + }, + { + "epoch": 0.9308839278359008, + "grad_norm": 8.325342178344727, + "learning_rate": 3.4485267869401654e-05, + "loss": 2.9104, + "step": 2994500 + }, + { + "epoch": 0.9310393601163877, + "grad_norm": 7.9314866065979, + "learning_rate": 3.448267733139354e-05, + "loss": 2.8384, + "step": 2995000 + }, + { + "epoch": 0.9311947923968745, + "grad_norm": 8.72569751739502, + "learning_rate": 3.448008679338542e-05, + "loss": 2.8795, + "step": 2995500 + }, + { + "epoch": 0.9313502246773614, + "grad_norm": 14.14862060546875, + "learning_rate": 3.447749625537731e-05, + "loss": 2.8768, + "step": 2996000 + }, + { + "epoch": 0.9315056569578484, + "grad_norm": 11.30094051361084, + "learning_rate": 3.4474905717369196e-05, + "loss": 2.9356, + "step": 2996500 + }, + { + "epoch": 0.9316610892383352, + "grad_norm": 7.870829105377197, + "learning_rate": 3.447231517936108e-05, + "loss": 2.884, + "step": 2997000 + }, + { + "epoch": 0.9318165215188221, + "grad_norm": 12.501680374145508, + "learning_rate": 3.446972464135297e-05, + "loss": 2.9117, + "step": 2997500 + }, + { + "epoch": 0.931971953799309, + "grad_norm": 25.085172653198242, + "learning_rate": 3.446713410334485e-05, + "loss": 2.8423, + "step": 2998000 + }, + { + "epoch": 0.9321273860797958, + "grad_norm": 7.890192031860352, + "learning_rate": 3.446454356533674e-05, + "loss": 2.8505, + "step": 2998500 + }, + { + "epoch": 0.9322828183602827, + "grad_norm": 11.415363311767578, + "learning_rate": 3.4461953027328625e-05, + "loss": 2.854, + "step": 2999000 + }, + { + "epoch": 0.9324382506407696, + "grad_norm": 9.831151008605957, + "learning_rate": 3.4459362489320505e-05, + "loss": 2.8727, + "step": 2999500 + }, + { + "epoch": 0.9325936829212564, + "grad_norm": 11.22499942779541, + "learning_rate": 3.445677195131239e-05, + "loss": 2.8871, + "step": 3000000 + }, + { + "epoch": 0.9327491152017433, + "grad_norm": 6.666486740112305, + "learning_rate": 3.445418141330428e-05, + "loss": 2.8528, + "step": 3000500 + }, + { + "epoch": 0.9329045474822302, + "grad_norm": 15.420902252197266, + "learning_rate": 3.4451590875296167e-05, + "loss": 2.9269, + "step": 3001000 + }, + { + "epoch": 0.933059979762717, + "grad_norm": 8.150522232055664, + "learning_rate": 3.4449000337288054e-05, + "loss": 2.8688, + "step": 3001500 + }, + { + "epoch": 0.9332154120432039, + "grad_norm": 8.821703910827637, + "learning_rate": 3.444640979927994e-05, + "loss": 2.851, + "step": 3002000 + }, + { + "epoch": 0.9333708443236909, + "grad_norm": 9.582864761352539, + "learning_rate": 3.444381926127182e-05, + "loss": 2.8827, + "step": 3002500 + }, + { + "epoch": 0.9335262766041778, + "grad_norm": 7.244638442993164, + "learning_rate": 3.444122872326371e-05, + "loss": 2.8678, + "step": 3003000 + }, + { + "epoch": 0.9336817088846646, + "grad_norm": 8.629719734191895, + "learning_rate": 3.443863818525559e-05, + "loss": 2.8841, + "step": 3003500 + }, + { + "epoch": 0.9338371411651515, + "grad_norm": 7.386870384216309, + "learning_rate": 3.4436047647247476e-05, + "loss": 2.8859, + "step": 3004000 + }, + { + "epoch": 0.9339925734456384, + "grad_norm": 8.183701515197754, + "learning_rate": 3.443345710923936e-05, + "loss": 2.8819, + "step": 3004500 + }, + { + "epoch": 0.9341480057261252, + "grad_norm": 7.988589286804199, + "learning_rate": 3.4430866571231243e-05, + "loss": 2.9026, + "step": 3005000 + }, + { + "epoch": 0.9343034380066121, + "grad_norm": 9.844691276550293, + "learning_rate": 3.442827603322313e-05, + "loss": 2.9124, + "step": 3005500 + }, + { + "epoch": 0.934458870287099, + "grad_norm": 9.35197639465332, + "learning_rate": 3.442568549521502e-05, + "loss": 2.9091, + "step": 3006000 + }, + { + "epoch": 0.9346143025675858, + "grad_norm": 9.485770225524902, + "learning_rate": 3.4423094957206905e-05, + "loss": 2.8522, + "step": 3006500 + }, + { + "epoch": 0.9347697348480727, + "grad_norm": 10.224061965942383, + "learning_rate": 3.442050441919879e-05, + "loss": 2.8332, + "step": 3007000 + }, + { + "epoch": 0.9349251671285596, + "grad_norm": 8.036727905273438, + "learning_rate": 3.441791388119068e-05, + "loss": 2.8572, + "step": 3007500 + }, + { + "epoch": 0.9350805994090464, + "grad_norm": 10.02910041809082, + "learning_rate": 3.441532334318256e-05, + "loss": 2.9001, + "step": 3008000 + }, + { + "epoch": 0.9352360316895334, + "grad_norm": 8.700174331665039, + "learning_rate": 3.441273280517445e-05, + "loss": 2.8918, + "step": 3008500 + }, + { + "epoch": 0.9353914639700203, + "grad_norm": 7.494165420532227, + "learning_rate": 3.441014226716633e-05, + "loss": 2.8129, + "step": 3009000 + }, + { + "epoch": 0.9355468962505071, + "grad_norm": 9.447959899902344, + "learning_rate": 3.4407551729158214e-05, + "loss": 2.8814, + "step": 3009500 + }, + { + "epoch": 0.935702328530994, + "grad_norm": 8.47057819366455, + "learning_rate": 3.44049611911501e-05, + "loss": 2.8683, + "step": 3010000 + }, + { + "epoch": 0.9358577608114809, + "grad_norm": 10.695176124572754, + "learning_rate": 3.440237065314199e-05, + "loss": 2.8981, + "step": 3010500 + }, + { + "epoch": 0.9360131930919677, + "grad_norm": 10.485987663269043, + "learning_rate": 3.4399780115133876e-05, + "loss": 2.8518, + "step": 3011000 + }, + { + "epoch": 0.9361686253724546, + "grad_norm": 7.675967216491699, + "learning_rate": 3.439718957712576e-05, + "loss": 2.8868, + "step": 3011500 + }, + { + "epoch": 0.9363240576529415, + "grad_norm": 8.95802116394043, + "learning_rate": 3.439459903911764e-05, + "loss": 2.9153, + "step": 3012000 + }, + { + "epoch": 0.9364794899334283, + "grad_norm": 9.09560489654541, + "learning_rate": 3.439200850110953e-05, + "loss": 2.9074, + "step": 3012500 + }, + { + "epoch": 0.9366349222139152, + "grad_norm": 8.181585311889648, + "learning_rate": 3.438941796310142e-05, + "loss": 2.853, + "step": 3013000 + }, + { + "epoch": 0.9367903544944021, + "grad_norm": 8.680970191955566, + "learning_rate": 3.43868274250933e-05, + "loss": 2.9117, + "step": 3013500 + }, + { + "epoch": 0.9369457867748889, + "grad_norm": 7.742905616760254, + "learning_rate": 3.4384236887085185e-05, + "loss": 2.9249, + "step": 3014000 + }, + { + "epoch": 0.9371012190553759, + "grad_norm": 10.527630805969238, + "learning_rate": 3.4381646349077066e-05, + "loss": 2.8927, + "step": 3014500 + }, + { + "epoch": 0.9372566513358628, + "grad_norm": 9.170111656188965, + "learning_rate": 3.437905581106895e-05, + "loss": 2.8503, + "step": 3015000 + }, + { + "epoch": 0.9374120836163496, + "grad_norm": 9.905467987060547, + "learning_rate": 3.437646527306084e-05, + "loss": 2.8235, + "step": 3015500 + }, + { + "epoch": 0.9375675158968365, + "grad_norm": 8.779318809509277, + "learning_rate": 3.437387473505273e-05, + "loss": 2.8605, + "step": 3016000 + }, + { + "epoch": 0.9377229481773234, + "grad_norm": 12.479086875915527, + "learning_rate": 3.4371284197044614e-05, + "loss": 2.9013, + "step": 3016500 + }, + { + "epoch": 0.9378783804578102, + "grad_norm": 8.330708503723145, + "learning_rate": 3.43686936590365e-05, + "loss": 2.9181, + "step": 3017000 + }, + { + "epoch": 0.9380338127382971, + "grad_norm": 10.316177368164062, + "learning_rate": 3.436610312102838e-05, + "loss": 2.8945, + "step": 3017500 + }, + { + "epoch": 0.938189245018784, + "grad_norm": 10.433966636657715, + "learning_rate": 3.436351258302027e-05, + "loss": 2.8901, + "step": 3018000 + }, + { + "epoch": 0.9383446772992708, + "grad_norm": 10.348794937133789, + "learning_rate": 3.4360922045012156e-05, + "loss": 2.8732, + "step": 3018500 + }, + { + "epoch": 0.9385001095797577, + "grad_norm": 9.053220748901367, + "learning_rate": 3.4358331507004036e-05, + "loss": 2.8601, + "step": 3019000 + }, + { + "epoch": 0.9386555418602446, + "grad_norm": 7.401663780212402, + "learning_rate": 3.4355740968995923e-05, + "loss": 2.9063, + "step": 3019500 + }, + { + "epoch": 0.9388109741407314, + "grad_norm": 10.625239372253418, + "learning_rate": 3.435315043098781e-05, + "loss": 2.8692, + "step": 3020000 + }, + { + "epoch": 0.9389664064212184, + "grad_norm": 7.543430328369141, + "learning_rate": 3.43505598929797e-05, + "loss": 2.8757, + "step": 3020500 + }, + { + "epoch": 0.9391218387017053, + "grad_norm": 10.246623992919922, + "learning_rate": 3.4347969354971585e-05, + "loss": 2.8995, + "step": 3021000 + }, + { + "epoch": 0.9392772709821922, + "grad_norm": 9.599881172180176, + "learning_rate": 3.4345378816963465e-05, + "loss": 2.8968, + "step": 3021500 + }, + { + "epoch": 0.939432703262679, + "grad_norm": 8.139756202697754, + "learning_rate": 3.434278827895535e-05, + "loss": 2.9153, + "step": 3022000 + }, + { + "epoch": 0.9395881355431659, + "grad_norm": 7.815672874450684, + "learning_rate": 3.434019774094724e-05, + "loss": 2.9306, + "step": 3022500 + }, + { + "epoch": 0.9397435678236528, + "grad_norm": 7.617238521575928, + "learning_rate": 3.433760720293912e-05, + "loss": 2.9163, + "step": 3023000 + }, + { + "epoch": 0.9398990001041396, + "grad_norm": 8.325281143188477, + "learning_rate": 3.433501666493101e-05, + "loss": 2.8397, + "step": 3023500 + }, + { + "epoch": 0.9400544323846265, + "grad_norm": 9.098665237426758, + "learning_rate": 3.4332426126922894e-05, + "loss": 2.8782, + "step": 3024000 + }, + { + "epoch": 0.9402098646651134, + "grad_norm": 8.562333106994629, + "learning_rate": 3.4329835588914775e-05, + "loss": 2.8661, + "step": 3024500 + }, + { + "epoch": 0.9403652969456002, + "grad_norm": 7.151117324829102, + "learning_rate": 3.432724505090666e-05, + "loss": 2.8663, + "step": 3025000 + }, + { + "epoch": 0.9405207292260871, + "grad_norm": 7.009247779846191, + "learning_rate": 3.432465451289855e-05, + "loss": 2.8997, + "step": 3025500 + }, + { + "epoch": 0.940676161506574, + "grad_norm": 18.475576400756836, + "learning_rate": 3.4322063974890436e-05, + "loss": 2.8579, + "step": 3026000 + }, + { + "epoch": 0.9408315937870609, + "grad_norm": 8.486063003540039, + "learning_rate": 3.431947343688232e-05, + "loss": 2.8848, + "step": 3026500 + }, + { + "epoch": 0.9409870260675478, + "grad_norm": 15.179580688476562, + "learning_rate": 3.4316882898874204e-05, + "loss": 2.8228, + "step": 3027000 + }, + { + "epoch": 0.9411424583480347, + "grad_norm": 11.466093063354492, + "learning_rate": 3.431429236086609e-05, + "loss": 2.8584, + "step": 3027500 + }, + { + "epoch": 0.9412978906285215, + "grad_norm": 9.262569427490234, + "learning_rate": 3.431170182285798e-05, + "loss": 2.8946, + "step": 3028000 + }, + { + "epoch": 0.9414533229090084, + "grad_norm": 8.322342872619629, + "learning_rate": 3.430911128484986e-05, + "loss": 2.8681, + "step": 3028500 + }, + { + "epoch": 0.9416087551894953, + "grad_norm": 8.100214004516602, + "learning_rate": 3.4306520746841746e-05, + "loss": 2.8829, + "step": 3029000 + }, + { + "epoch": 0.9417641874699821, + "grad_norm": 12.680253982543945, + "learning_rate": 3.430393020883363e-05, + "loss": 2.8979, + "step": 3029500 + }, + { + "epoch": 0.941919619750469, + "grad_norm": 13.622167587280273, + "learning_rate": 3.430133967082551e-05, + "loss": 2.8577, + "step": 3030000 + }, + { + "epoch": 0.9420750520309559, + "grad_norm": 12.246444702148438, + "learning_rate": 3.429874913281741e-05, + "loss": 2.9021, + "step": 3030500 + }, + { + "epoch": 0.9422304843114427, + "grad_norm": 7.685930252075195, + "learning_rate": 3.4296158594809294e-05, + "loss": 2.837, + "step": 3031000 + }, + { + "epoch": 0.9423859165919296, + "grad_norm": 9.464300155639648, + "learning_rate": 3.4293568056801175e-05, + "loss": 2.9346, + "step": 3031500 + }, + { + "epoch": 0.9425413488724165, + "grad_norm": 8.601617813110352, + "learning_rate": 3.429097751879306e-05, + "loss": 2.8669, + "step": 3032000 + }, + { + "epoch": 0.9426967811529035, + "grad_norm": 8.299190521240234, + "learning_rate": 3.428838698078494e-05, + "loss": 2.8555, + "step": 3032500 + }, + { + "epoch": 0.9428522134333903, + "grad_norm": 10.858049392700195, + "learning_rate": 3.428579644277683e-05, + "loss": 2.8734, + "step": 3033000 + }, + { + "epoch": 0.9430076457138772, + "grad_norm": 10.046987533569336, + "learning_rate": 3.4283205904768716e-05, + "loss": 2.8747, + "step": 3033500 + }, + { + "epoch": 0.943163077994364, + "grad_norm": 9.65788459777832, + "learning_rate": 3.42806153667606e-05, + "loss": 2.8619, + "step": 3034000 + }, + { + "epoch": 0.9433185102748509, + "grad_norm": 13.53073501586914, + "learning_rate": 3.4278024828752484e-05, + "loss": 2.8919, + "step": 3034500 + }, + { + "epoch": 0.9434739425553378, + "grad_norm": 10.383671760559082, + "learning_rate": 3.427543429074437e-05, + "loss": 2.8678, + "step": 3035000 + }, + { + "epoch": 0.9436293748358247, + "grad_norm": 6.2898125648498535, + "learning_rate": 3.427284375273626e-05, + "loss": 2.8853, + "step": 3035500 + }, + { + "epoch": 0.9437848071163115, + "grad_norm": 13.87948226928711, + "learning_rate": 3.4270253214728145e-05, + "loss": 2.9003, + "step": 3036000 + }, + { + "epoch": 0.9439402393967984, + "grad_norm": 8.254533767700195, + "learning_rate": 3.426766267672003e-05, + "loss": 2.8635, + "step": 3036500 + }, + { + "epoch": 0.9440956716772853, + "grad_norm": 6.819817066192627, + "learning_rate": 3.426507213871191e-05, + "loss": 2.9418, + "step": 3037000 + }, + { + "epoch": 0.9442511039577721, + "grad_norm": 9.777987480163574, + "learning_rate": 3.42624816007038e-05, + "loss": 2.855, + "step": 3037500 + }, + { + "epoch": 0.944406536238259, + "grad_norm": 8.68952751159668, + "learning_rate": 3.425989106269569e-05, + "loss": 2.9635, + "step": 3038000 + }, + { + "epoch": 0.944561968518746, + "grad_norm": 8.865028381347656, + "learning_rate": 3.425730052468757e-05, + "loss": 2.8651, + "step": 3038500 + }, + { + "epoch": 0.9447174007992328, + "grad_norm": 10.105271339416504, + "learning_rate": 3.4254709986679455e-05, + "loss": 2.8721, + "step": 3039000 + }, + { + "epoch": 0.9448728330797197, + "grad_norm": 8.773923873901367, + "learning_rate": 3.4252119448671335e-05, + "loss": 2.8731, + "step": 3039500 + }, + { + "epoch": 0.9450282653602066, + "grad_norm": 7.803600311279297, + "learning_rate": 3.424952891066322e-05, + "loss": 2.8812, + "step": 3040000 + }, + { + "epoch": 0.9451836976406934, + "grad_norm": 7.169610023498535, + "learning_rate": 3.4246938372655116e-05, + "loss": 2.8614, + "step": 3040500 + }, + { + "epoch": 0.9453391299211803, + "grad_norm": 8.8060941696167, + "learning_rate": 3.4244347834646997e-05, + "loss": 2.8955, + "step": 3041000 + }, + { + "epoch": 0.9454945622016672, + "grad_norm": 9.871542930603027, + "learning_rate": 3.4241757296638884e-05, + "loss": 2.9256, + "step": 3041500 + }, + { + "epoch": 0.945649994482154, + "grad_norm": 7.49997615814209, + "learning_rate": 3.423916675863077e-05, + "loss": 2.9028, + "step": 3042000 + }, + { + "epoch": 0.9458054267626409, + "grad_norm": 9.15575122833252, + "learning_rate": 3.423657622062265e-05, + "loss": 2.8652, + "step": 3042500 + }, + { + "epoch": 0.9459608590431278, + "grad_norm": 10.09560489654541, + "learning_rate": 3.423398568261454e-05, + "loss": 2.8826, + "step": 3043000 + }, + { + "epoch": 0.9461162913236146, + "grad_norm": 7.488994598388672, + "learning_rate": 3.4231395144606426e-05, + "loss": 2.8816, + "step": 3043500 + }, + { + "epoch": 0.9462717236041015, + "grad_norm": 12.606825828552246, + "learning_rate": 3.4228804606598306e-05, + "loss": 2.8587, + "step": 3044000 + }, + { + "epoch": 0.9464271558845885, + "grad_norm": 7.342442035675049, + "learning_rate": 3.422621406859019e-05, + "loss": 2.9092, + "step": 3044500 + }, + { + "epoch": 0.9465825881650753, + "grad_norm": 6.644742965698242, + "learning_rate": 3.422362353058208e-05, + "loss": 2.8911, + "step": 3045000 + }, + { + "epoch": 0.9467380204455622, + "grad_norm": 9.949429512023926, + "learning_rate": 3.422103299257397e-05, + "loss": 2.8809, + "step": 3045500 + }, + { + "epoch": 0.9468934527260491, + "grad_norm": 8.269856452941895, + "learning_rate": 3.4218442454565855e-05, + "loss": 2.8926, + "step": 3046000 + }, + { + "epoch": 0.947048885006536, + "grad_norm": 9.044086456298828, + "learning_rate": 3.4215851916557735e-05, + "loss": 2.91, + "step": 3046500 + }, + { + "epoch": 0.9472043172870228, + "grad_norm": 12.261157989501953, + "learning_rate": 3.421326137854962e-05, + "loss": 2.8838, + "step": 3047000 + }, + { + "epoch": 0.9473597495675097, + "grad_norm": 9.789937019348145, + "learning_rate": 3.421067084054151e-05, + "loss": 2.8911, + "step": 3047500 + }, + { + "epoch": 0.9475151818479965, + "grad_norm": 8.530821800231934, + "learning_rate": 3.420808030253339e-05, + "loss": 2.888, + "step": 3048000 + }, + { + "epoch": 0.9476706141284834, + "grad_norm": 9.39022159576416, + "learning_rate": 3.420548976452528e-05, + "loss": 2.9056, + "step": 3048500 + }, + { + "epoch": 0.9478260464089703, + "grad_norm": 8.525893211364746, + "learning_rate": 3.4202899226517164e-05, + "loss": 2.8841, + "step": 3049000 + }, + { + "epoch": 0.9479814786894571, + "grad_norm": 9.261918067932129, + "learning_rate": 3.4200308688509044e-05, + "loss": 2.8666, + "step": 3049500 + }, + { + "epoch": 0.948136910969944, + "grad_norm": 9.783754348754883, + "learning_rate": 3.419771815050093e-05, + "loss": 2.8662, + "step": 3050000 + }, + { + "epoch": 0.948292343250431, + "grad_norm": 8.51305866241455, + "learning_rate": 3.419512761249282e-05, + "loss": 2.8361, + "step": 3050500 + }, + { + "epoch": 0.9484477755309179, + "grad_norm": 7.589217662811279, + "learning_rate": 3.4192537074484706e-05, + "loss": 2.8801, + "step": 3051000 + }, + { + "epoch": 0.9486032078114047, + "grad_norm": 9.739825248718262, + "learning_rate": 3.418994653647659e-05, + "loss": 2.8983, + "step": 3051500 + }, + { + "epoch": 0.9487586400918916, + "grad_norm": 12.884063720703125, + "learning_rate": 3.418735599846847e-05, + "loss": 2.8481, + "step": 3052000 + }, + { + "epoch": 0.9489140723723785, + "grad_norm": 7.894681453704834, + "learning_rate": 3.418476546046036e-05, + "loss": 2.965, + "step": 3052500 + }, + { + "epoch": 0.9490695046528653, + "grad_norm": 8.840742111206055, + "learning_rate": 3.418217492245225e-05, + "loss": 2.9191, + "step": 3053000 + }, + { + "epoch": 0.9492249369333522, + "grad_norm": 9.257515907287598, + "learning_rate": 3.417958438444413e-05, + "loss": 2.9214, + "step": 3053500 + }, + { + "epoch": 0.9493803692138391, + "grad_norm": 7.957951545715332, + "learning_rate": 3.4176993846436015e-05, + "loss": 2.8927, + "step": 3054000 + }, + { + "epoch": 0.9495358014943259, + "grad_norm": 9.471379280090332, + "learning_rate": 3.41744033084279e-05, + "loss": 2.8999, + "step": 3054500 + }, + { + "epoch": 0.9496912337748128, + "grad_norm": 6.391426086425781, + "learning_rate": 3.417181277041979e-05, + "loss": 2.8521, + "step": 3055000 + }, + { + "epoch": 0.9498466660552997, + "grad_norm": 9.724116325378418, + "learning_rate": 3.4169222232411677e-05, + "loss": 2.8482, + "step": 3055500 + }, + { + "epoch": 0.9500020983357865, + "grad_norm": 9.311724662780762, + "learning_rate": 3.4166631694403564e-05, + "loss": 2.8514, + "step": 3056000 + }, + { + "epoch": 0.9501575306162734, + "grad_norm": 10.251517295837402, + "learning_rate": 3.4164041156395444e-05, + "loss": 2.8938, + "step": 3056500 + }, + { + "epoch": 0.9503129628967604, + "grad_norm": 8.171961784362793, + "learning_rate": 3.416145061838733e-05, + "loss": 2.8661, + "step": 3057000 + }, + { + "epoch": 0.9504683951772472, + "grad_norm": 14.941003799438477, + "learning_rate": 3.415886008037921e-05, + "loss": 2.8741, + "step": 3057500 + }, + { + "epoch": 0.9506238274577341, + "grad_norm": 8.057555198669434, + "learning_rate": 3.41562695423711e-05, + "loss": 2.8901, + "step": 3058000 + }, + { + "epoch": 0.950779259738221, + "grad_norm": 8.294917106628418, + "learning_rate": 3.4153679004362986e-05, + "loss": 2.8894, + "step": 3058500 + }, + { + "epoch": 0.9509346920187078, + "grad_norm": 9.771260261535645, + "learning_rate": 3.4151088466354866e-05, + "loss": 2.8934, + "step": 3059000 + }, + { + "epoch": 0.9510901242991947, + "grad_norm": 7.850129127502441, + "learning_rate": 3.4148497928346753e-05, + "loss": 2.8922, + "step": 3059500 + }, + { + "epoch": 0.9512455565796816, + "grad_norm": 8.514156341552734, + "learning_rate": 3.414590739033864e-05, + "loss": 2.8795, + "step": 3060000 + }, + { + "epoch": 0.9514009888601684, + "grad_norm": 7.754580020904541, + "learning_rate": 3.414331685233053e-05, + "loss": 2.8689, + "step": 3060500 + }, + { + "epoch": 0.9515564211406553, + "grad_norm": 7.886673927307129, + "learning_rate": 3.4140726314322415e-05, + "loss": 2.8965, + "step": 3061000 + }, + { + "epoch": 0.9517118534211422, + "grad_norm": 9.306175231933594, + "learning_rate": 3.41381357763143e-05, + "loss": 2.934, + "step": 3061500 + }, + { + "epoch": 0.951867285701629, + "grad_norm": 7.296777725219727, + "learning_rate": 3.413554523830618e-05, + "loss": 2.8738, + "step": 3062000 + }, + { + "epoch": 0.9520227179821159, + "grad_norm": 6.601929187774658, + "learning_rate": 3.413295470029807e-05, + "loss": 2.8878, + "step": 3062500 + }, + { + "epoch": 0.9521781502626029, + "grad_norm": 9.333993911743164, + "learning_rate": 3.413036416228995e-05, + "loss": 2.8738, + "step": 3063000 + }, + { + "epoch": 0.9523335825430898, + "grad_norm": 12.64460563659668, + "learning_rate": 3.412777362428184e-05, + "loss": 2.8723, + "step": 3063500 + }, + { + "epoch": 0.9524890148235766, + "grad_norm": 14.185647964477539, + "learning_rate": 3.4125183086273724e-05, + "loss": 2.8935, + "step": 3064000 + }, + { + "epoch": 0.9526444471040635, + "grad_norm": 7.969057559967041, + "learning_rate": 3.412259254826561e-05, + "loss": 2.901, + "step": 3064500 + }, + { + "epoch": 0.9527998793845504, + "grad_norm": 7.9769978523254395, + "learning_rate": 3.41200020102575e-05, + "loss": 2.8678, + "step": 3065000 + }, + { + "epoch": 0.9529553116650372, + "grad_norm": 6.904517650604248, + "learning_rate": 3.4117411472249386e-05, + "loss": 2.8416, + "step": 3065500 + }, + { + "epoch": 0.9531107439455241, + "grad_norm": 12.155352592468262, + "learning_rate": 3.4114820934241266e-05, + "loss": 2.8824, + "step": 3066000 + }, + { + "epoch": 0.953266176226011, + "grad_norm": 9.217001914978027, + "learning_rate": 3.411223039623315e-05, + "loss": 2.8498, + "step": 3066500 + }, + { + "epoch": 0.9534216085064978, + "grad_norm": 7.633697986602783, + "learning_rate": 3.410963985822504e-05, + "loss": 2.8523, + "step": 3067000 + }, + { + "epoch": 0.9535770407869847, + "grad_norm": 13.379117012023926, + "learning_rate": 3.410704932021692e-05, + "loss": 2.911, + "step": 3067500 + }, + { + "epoch": 0.9537324730674716, + "grad_norm": 11.370670318603516, + "learning_rate": 3.410445878220881e-05, + "loss": 2.8684, + "step": 3068000 + }, + { + "epoch": 0.9538879053479584, + "grad_norm": 11.201325416564941, + "learning_rate": 3.4101868244200695e-05, + "loss": 2.817, + "step": 3068500 + }, + { + "epoch": 0.9540433376284454, + "grad_norm": 7.675537586212158, + "learning_rate": 3.4099277706192575e-05, + "loss": 2.8826, + "step": 3069000 + }, + { + "epoch": 0.9541987699089323, + "grad_norm": 7.047304630279541, + "learning_rate": 3.409668716818446e-05, + "loss": 2.8633, + "step": 3069500 + }, + { + "epoch": 0.9543542021894191, + "grad_norm": 7.114480018615723, + "learning_rate": 3.409409663017635e-05, + "loss": 2.9083, + "step": 3070000 + }, + { + "epoch": 0.954509634469906, + "grad_norm": 7.492720127105713, + "learning_rate": 3.409150609216824e-05, + "loss": 2.9031, + "step": 3070500 + }, + { + "epoch": 0.9546650667503929, + "grad_norm": 105.93660736083984, + "learning_rate": 3.4088915554160124e-05, + "loss": 2.8896, + "step": 3071000 + }, + { + "epoch": 0.9548204990308797, + "grad_norm": 10.742931365966797, + "learning_rate": 3.4086325016152004e-05, + "loss": 2.8943, + "step": 3071500 + }, + { + "epoch": 0.9549759313113666, + "grad_norm": 11.24653434753418, + "learning_rate": 3.408373447814389e-05, + "loss": 2.854, + "step": 3072000 + }, + { + "epoch": 0.9551313635918535, + "grad_norm": 7.6856184005737305, + "learning_rate": 3.408114394013578e-05, + "loss": 2.8578, + "step": 3072500 + }, + { + "epoch": 0.9552867958723403, + "grad_norm": 8.57555103302002, + "learning_rate": 3.407855340212766e-05, + "loss": 2.8983, + "step": 3073000 + }, + { + "epoch": 0.9554422281528272, + "grad_norm": 11.161865234375, + "learning_rate": 3.4075962864119546e-05, + "loss": 2.8562, + "step": 3073500 + }, + { + "epoch": 0.9555976604333141, + "grad_norm": 6.925288200378418, + "learning_rate": 3.4073372326111433e-05, + "loss": 2.8591, + "step": 3074000 + }, + { + "epoch": 0.9557530927138009, + "grad_norm": 12.001543998718262, + "learning_rate": 3.407078178810332e-05, + "loss": 2.8703, + "step": 3074500 + }, + { + "epoch": 0.9559085249942879, + "grad_norm": 9.767850875854492, + "learning_rate": 3.406819125009521e-05, + "loss": 2.8792, + "step": 3075000 + }, + { + "epoch": 0.9560639572747748, + "grad_norm": 7.628316402435303, + "learning_rate": 3.406560071208709e-05, + "loss": 2.9276, + "step": 3075500 + }, + { + "epoch": 0.9562193895552616, + "grad_norm": 9.651199340820312, + "learning_rate": 3.4063010174078975e-05, + "loss": 2.8257, + "step": 3076000 + }, + { + "epoch": 0.9563748218357485, + "grad_norm": 10.12826156616211, + "learning_rate": 3.406041963607086e-05, + "loss": 2.8699, + "step": 3076500 + }, + { + "epoch": 0.9565302541162354, + "grad_norm": 8.86946964263916, + "learning_rate": 3.405782909806274e-05, + "loss": 2.8582, + "step": 3077000 + }, + { + "epoch": 0.9566856863967222, + "grad_norm": 12.218640327453613, + "learning_rate": 3.405523856005463e-05, + "loss": 2.884, + "step": 3077500 + }, + { + "epoch": 0.9568411186772091, + "grad_norm": 8.255889892578125, + "learning_rate": 3.405264802204652e-05, + "loss": 2.8728, + "step": 3078000 + }, + { + "epoch": 0.956996550957696, + "grad_norm": 8.698092460632324, + "learning_rate": 3.40500574840384e-05, + "loss": 2.9022, + "step": 3078500 + }, + { + "epoch": 0.9571519832381828, + "grad_norm": 6.665513038635254, + "learning_rate": 3.4047466946030285e-05, + "loss": 2.9441, + "step": 3079000 + }, + { + "epoch": 0.9573074155186697, + "grad_norm": 10.397817611694336, + "learning_rate": 3.404487640802217e-05, + "loss": 2.8852, + "step": 3079500 + }, + { + "epoch": 0.9574628477991566, + "grad_norm": 12.202007293701172, + "learning_rate": 3.404228587001406e-05, + "loss": 2.8522, + "step": 3080000 + }, + { + "epoch": 0.9576182800796434, + "grad_norm": 10.018143653869629, + "learning_rate": 3.4039695332005946e-05, + "loss": 2.8318, + "step": 3080500 + }, + { + "epoch": 0.9577737123601304, + "grad_norm": 8.928736686706543, + "learning_rate": 3.4037104793997827e-05, + "loss": 2.9068, + "step": 3081000 + }, + { + "epoch": 0.9579291446406173, + "grad_norm": 11.86568546295166, + "learning_rate": 3.4034514255989714e-05, + "loss": 2.8948, + "step": 3081500 + }, + { + "epoch": 0.9580845769211042, + "grad_norm": 8.370084762573242, + "learning_rate": 3.40319237179816e-05, + "loss": 2.9028, + "step": 3082000 + }, + { + "epoch": 0.958240009201591, + "grad_norm": 8.931103706359863, + "learning_rate": 3.402933317997348e-05, + "loss": 2.8586, + "step": 3082500 + }, + { + "epoch": 0.9583954414820779, + "grad_norm": 9.088334083557129, + "learning_rate": 3.402674264196537e-05, + "loss": 2.8666, + "step": 3083000 + }, + { + "epoch": 0.9585508737625648, + "grad_norm": 9.102568626403809, + "learning_rate": 3.4024152103957256e-05, + "loss": 2.8753, + "step": 3083500 + }, + { + "epoch": 0.9587063060430516, + "grad_norm": 11.555868148803711, + "learning_rate": 3.402156156594914e-05, + "loss": 2.8858, + "step": 3084000 + }, + { + "epoch": 0.9588617383235385, + "grad_norm": 8.12809944152832, + "learning_rate": 3.401897102794103e-05, + "loss": 2.8896, + "step": 3084500 + }, + { + "epoch": 0.9590171706040254, + "grad_norm": 10.97871208190918, + "learning_rate": 3.401638048993292e-05, + "loss": 2.8361, + "step": 3085000 + }, + { + "epoch": 0.9591726028845122, + "grad_norm": 7.961181640625, + "learning_rate": 3.40137899519248e-05, + "loss": 2.8955, + "step": 3085500 + }, + { + "epoch": 0.9593280351649991, + "grad_norm": 7.012466907501221, + "learning_rate": 3.4011199413916684e-05, + "loss": 2.9219, + "step": 3086000 + }, + { + "epoch": 0.959483467445486, + "grad_norm": 8.066007614135742, + "learning_rate": 3.400860887590857e-05, + "loss": 2.9354, + "step": 3086500 + }, + { + "epoch": 0.9596388997259729, + "grad_norm": 9.36927318572998, + "learning_rate": 3.400601833790045e-05, + "loss": 2.8599, + "step": 3087000 + }, + { + "epoch": 0.9597943320064598, + "grad_norm": 7.7574920654296875, + "learning_rate": 3.400342779989234e-05, + "loss": 2.8571, + "step": 3087500 + }, + { + "epoch": 0.9599497642869467, + "grad_norm": 9.240211486816406, + "learning_rate": 3.400083726188422e-05, + "loss": 2.8845, + "step": 3088000 + }, + { + "epoch": 0.9601051965674335, + "grad_norm": 8.313228607177734, + "learning_rate": 3.399824672387611e-05, + "loss": 2.9139, + "step": 3088500 + }, + { + "epoch": 0.9602606288479204, + "grad_norm": 7.353842258453369, + "learning_rate": 3.3995656185867994e-05, + "loss": 2.8496, + "step": 3089000 + }, + { + "epoch": 0.9604160611284073, + "grad_norm": 6.213408470153809, + "learning_rate": 3.399306564785988e-05, + "loss": 2.8737, + "step": 3089500 + }, + { + "epoch": 0.9605714934088941, + "grad_norm": 6.930663108825684, + "learning_rate": 3.399047510985177e-05, + "loss": 2.8931, + "step": 3090000 + }, + { + "epoch": 0.960726925689381, + "grad_norm": 9.38718032836914, + "learning_rate": 3.3987884571843655e-05, + "loss": 2.9022, + "step": 3090500 + }, + { + "epoch": 0.9608823579698679, + "grad_norm": 9.749229431152344, + "learning_rate": 3.3985294033835536e-05, + "loss": 2.8679, + "step": 3091000 + }, + { + "epoch": 0.9610377902503547, + "grad_norm": 8.010993957519531, + "learning_rate": 3.398270349582742e-05, + "loss": 2.8518, + "step": 3091500 + }, + { + "epoch": 0.9611932225308416, + "grad_norm": 6.801774501800537, + "learning_rate": 3.398011295781931e-05, + "loss": 2.8308, + "step": 3092000 + }, + { + "epoch": 0.9613486548113285, + "grad_norm": 9.992043495178223, + "learning_rate": 3.397752241981119e-05, + "loss": 2.915, + "step": 3092500 + }, + { + "epoch": 0.9615040870918155, + "grad_norm": 19.81584930419922, + "learning_rate": 3.397493188180308e-05, + "loss": 2.8952, + "step": 3093000 + }, + { + "epoch": 0.9616595193723023, + "grad_norm": 5.7262282371521, + "learning_rate": 3.397234134379496e-05, + "loss": 2.8932, + "step": 3093500 + }, + { + "epoch": 0.9618149516527892, + "grad_norm": 12.608380317687988, + "learning_rate": 3.396975080578685e-05, + "loss": 2.9138, + "step": 3094000 + }, + { + "epoch": 0.961970383933276, + "grad_norm": 13.406660079956055, + "learning_rate": 3.396716026777874e-05, + "loss": 2.873, + "step": 3094500 + }, + { + "epoch": 0.9621258162137629, + "grad_norm": 8.174551010131836, + "learning_rate": 3.396456972977062e-05, + "loss": 2.9269, + "step": 3095000 + }, + { + "epoch": 0.9622812484942498, + "grad_norm": 7.121252059936523, + "learning_rate": 3.3961979191762507e-05, + "loss": 2.8279, + "step": 3095500 + }, + { + "epoch": 0.9624366807747367, + "grad_norm": 12.470903396606445, + "learning_rate": 3.3959388653754394e-05, + "loss": 2.8565, + "step": 3096000 + }, + { + "epoch": 0.9625921130552235, + "grad_norm": 14.378569602966309, + "learning_rate": 3.3956798115746274e-05, + "loss": 2.8944, + "step": 3096500 + }, + { + "epoch": 0.9627475453357104, + "grad_norm": 7.424893856048584, + "learning_rate": 3.395420757773816e-05, + "loss": 2.8617, + "step": 3097000 + }, + { + "epoch": 0.9629029776161973, + "grad_norm": 9.99614429473877, + "learning_rate": 3.395161703973005e-05, + "loss": 2.8926, + "step": 3097500 + }, + { + "epoch": 0.9630584098966841, + "grad_norm": 10.70880126953125, + "learning_rate": 3.394902650172193e-05, + "loss": 2.9035, + "step": 3098000 + }, + { + "epoch": 0.963213842177171, + "grad_norm": 9.50721549987793, + "learning_rate": 3.3946435963713816e-05, + "loss": 2.8893, + "step": 3098500 + }, + { + "epoch": 0.963369274457658, + "grad_norm": 9.991889953613281, + "learning_rate": 3.39438454257057e-05, + "loss": 2.871, + "step": 3099000 + }, + { + "epoch": 0.9635247067381448, + "grad_norm": 8.368491172790527, + "learning_rate": 3.394125488769759e-05, + "loss": 2.8567, + "step": 3099500 + }, + { + "epoch": 0.9636801390186317, + "grad_norm": 8.931339263916016, + "learning_rate": 3.393866434968948e-05, + "loss": 2.855, + "step": 3100000 + }, + { + "epoch": 0.9638355712991186, + "grad_norm": 7.6577372550964355, + "learning_rate": 3.393607381168136e-05, + "loss": 2.8852, + "step": 3100500 + }, + { + "epoch": 0.9639910035796054, + "grad_norm": 8.90649700164795, + "learning_rate": 3.3933483273673245e-05, + "loss": 2.8609, + "step": 3101000 + }, + { + "epoch": 0.9641464358600923, + "grad_norm": 8.663677215576172, + "learning_rate": 3.393089273566513e-05, + "loss": 2.8522, + "step": 3101500 + }, + { + "epoch": 0.9643018681405792, + "grad_norm": 9.105076789855957, + "learning_rate": 3.392830219765701e-05, + "loss": 2.8668, + "step": 3102000 + }, + { + "epoch": 0.964457300421066, + "grad_norm": 7.879927158355713, + "learning_rate": 3.39257116596489e-05, + "loss": 2.8726, + "step": 3102500 + }, + { + "epoch": 0.9646127327015529, + "grad_norm": 8.130962371826172, + "learning_rate": 3.392312112164079e-05, + "loss": 2.8338, + "step": 3103000 + }, + { + "epoch": 0.9647681649820398, + "grad_norm": 7.380917072296143, + "learning_rate": 3.392053058363267e-05, + "loss": 2.8716, + "step": 3103500 + }, + { + "epoch": 0.9649235972625266, + "grad_norm": 18.682607650756836, + "learning_rate": 3.391794004562456e-05, + "loss": 2.8563, + "step": 3104000 + }, + { + "epoch": 0.9650790295430135, + "grad_norm": 30.78267478942871, + "learning_rate": 3.391534950761645e-05, + "loss": 2.8667, + "step": 3104500 + }, + { + "epoch": 0.9652344618235005, + "grad_norm": 6.6802825927734375, + "learning_rate": 3.391275896960833e-05, + "loss": 2.9053, + "step": 3105000 + }, + { + "epoch": 0.9653898941039873, + "grad_norm": 8.20405101776123, + "learning_rate": 3.3910168431600216e-05, + "loss": 2.7998, + "step": 3105500 + }, + { + "epoch": 0.9655453263844742, + "grad_norm": 8.218668937683105, + "learning_rate": 3.3907577893592096e-05, + "loss": 2.9123, + "step": 3106000 + }, + { + "epoch": 0.9657007586649611, + "grad_norm": 11.468144416809082, + "learning_rate": 3.390498735558398e-05, + "loss": 2.8729, + "step": 3106500 + }, + { + "epoch": 0.965856190945448, + "grad_norm": 10.747625350952148, + "learning_rate": 3.390239681757587e-05, + "loss": 2.8803, + "step": 3107000 + }, + { + "epoch": 0.9660116232259348, + "grad_norm": 17.54783058166504, + "learning_rate": 3.389980627956775e-05, + "loss": 2.903, + "step": 3107500 + }, + { + "epoch": 0.9661670555064217, + "grad_norm": 5.565323352813721, + "learning_rate": 3.389721574155964e-05, + "loss": 2.8423, + "step": 3108000 + }, + { + "epoch": 0.9663224877869085, + "grad_norm": 7.346284866333008, + "learning_rate": 3.3894625203551525e-05, + "loss": 2.9034, + "step": 3108500 + }, + { + "epoch": 0.9664779200673954, + "grad_norm": 9.751054763793945, + "learning_rate": 3.389203466554341e-05, + "loss": 2.8584, + "step": 3109000 + }, + { + "epoch": 0.9666333523478823, + "grad_norm": 7.7128753662109375, + "learning_rate": 3.38894441275353e-05, + "loss": 2.8902, + "step": 3109500 + }, + { + "epoch": 0.9667887846283691, + "grad_norm": 8.080461502075195, + "learning_rate": 3.3886853589527187e-05, + "loss": 2.8978, + "step": 3110000 + }, + { + "epoch": 0.966944216908856, + "grad_norm": 8.069084167480469, + "learning_rate": 3.388426305151907e-05, + "loss": 2.9214, + "step": 3110500 + }, + { + "epoch": 0.967099649189343, + "grad_norm": 12.208199501037598, + "learning_rate": 3.3881672513510954e-05, + "loss": 2.9285, + "step": 3111000 + }, + { + "epoch": 0.9672550814698299, + "grad_norm": 9.15051555633545, + "learning_rate": 3.3879081975502834e-05, + "loss": 2.8388, + "step": 3111500 + }, + { + "epoch": 0.9674105137503167, + "grad_norm": 9.439848899841309, + "learning_rate": 3.387649143749472e-05, + "loss": 2.8471, + "step": 3112000 + }, + { + "epoch": 0.9675659460308036, + "grad_norm": 10.613800048828125, + "learning_rate": 3.387390089948661e-05, + "loss": 2.8801, + "step": 3112500 + }, + { + "epoch": 0.9677213783112905, + "grad_norm": 12.419949531555176, + "learning_rate": 3.387131036147849e-05, + "loss": 2.8746, + "step": 3113000 + }, + { + "epoch": 0.9678768105917773, + "grad_norm": 7.747188091278076, + "learning_rate": 3.3868719823470376e-05, + "loss": 2.8984, + "step": 3113500 + }, + { + "epoch": 0.9680322428722642, + "grad_norm": 12.954083442687988, + "learning_rate": 3.386612928546227e-05, + "loss": 2.8652, + "step": 3114000 + }, + { + "epoch": 0.9681876751527511, + "grad_norm": 8.120155334472656, + "learning_rate": 3.386353874745415e-05, + "loss": 2.8859, + "step": 3114500 + }, + { + "epoch": 0.9683431074332379, + "grad_norm": 11.558084487915039, + "learning_rate": 3.386094820944604e-05, + "loss": 2.9047, + "step": 3115000 + }, + { + "epoch": 0.9684985397137248, + "grad_norm": 8.217503547668457, + "learning_rate": 3.3858357671437925e-05, + "loss": 2.9082, + "step": 3115500 + }, + { + "epoch": 0.9686539719942117, + "grad_norm": 10.157495498657227, + "learning_rate": 3.3855767133429805e-05, + "loss": 2.9094, + "step": 3116000 + }, + { + "epoch": 0.9688094042746985, + "grad_norm": 9.094473838806152, + "learning_rate": 3.385317659542169e-05, + "loss": 2.915, + "step": 3116500 + }, + { + "epoch": 0.9689648365551855, + "grad_norm": 7.000076770782471, + "learning_rate": 3.385058605741357e-05, + "loss": 2.8854, + "step": 3117000 + }, + { + "epoch": 0.9691202688356724, + "grad_norm": 7.560038089752197, + "learning_rate": 3.384799551940546e-05, + "loss": 2.9006, + "step": 3117500 + }, + { + "epoch": 0.9692757011161592, + "grad_norm": 36.14039993286133, + "learning_rate": 3.384540498139735e-05, + "loss": 2.9137, + "step": 3118000 + }, + { + "epoch": 0.9694311333966461, + "grad_norm": 10.760619163513184, + "learning_rate": 3.3842814443389234e-05, + "loss": 2.8795, + "step": 3118500 + }, + { + "epoch": 0.969586565677133, + "grad_norm": 7.619880199432373, + "learning_rate": 3.384022390538112e-05, + "loss": 2.86, + "step": 3119000 + }, + { + "epoch": 0.9697419979576198, + "grad_norm": 10.616905212402344, + "learning_rate": 3.383763336737301e-05, + "loss": 2.8702, + "step": 3119500 + }, + { + "epoch": 0.9698974302381067, + "grad_norm": 7.889766216278076, + "learning_rate": 3.383504282936489e-05, + "loss": 2.8714, + "step": 3120000 + }, + { + "epoch": 0.9700528625185936, + "grad_norm": 10.018810272216797, + "learning_rate": 3.3832452291356776e-05, + "loss": 2.8522, + "step": 3120500 + }, + { + "epoch": 0.9702082947990804, + "grad_norm": 9.49222469329834, + "learning_rate": 3.382986175334866e-05, + "loss": 2.8746, + "step": 3121000 + }, + { + "epoch": 0.9703637270795673, + "grad_norm": 14.90281867980957, + "learning_rate": 3.3827271215340544e-05, + "loss": 2.9453, + "step": 3121500 + }, + { + "epoch": 0.9705191593600542, + "grad_norm": 10.428327560424805, + "learning_rate": 3.382468067733243e-05, + "loss": 2.8702, + "step": 3122000 + }, + { + "epoch": 0.970674591640541, + "grad_norm": 14.320651054382324, + "learning_rate": 3.382209013932432e-05, + "loss": 2.8893, + "step": 3122500 + }, + { + "epoch": 0.970830023921028, + "grad_norm": 8.93087100982666, + "learning_rate": 3.38194996013162e-05, + "loss": 2.8654, + "step": 3123000 + }, + { + "epoch": 0.9709854562015149, + "grad_norm": 14.160866737365723, + "learning_rate": 3.3816909063308085e-05, + "loss": 2.8804, + "step": 3123500 + }, + { + "epoch": 0.9711408884820018, + "grad_norm": 6.861609935760498, + "learning_rate": 3.381431852529997e-05, + "loss": 2.8859, + "step": 3124000 + }, + { + "epoch": 0.9712963207624886, + "grad_norm": 10.464085578918457, + "learning_rate": 3.381172798729186e-05, + "loss": 2.8496, + "step": 3124500 + }, + { + "epoch": 0.9714517530429755, + "grad_norm": 7.183316230773926, + "learning_rate": 3.380913744928375e-05, + "loss": 2.8438, + "step": 3125000 + }, + { + "epoch": 0.9716071853234624, + "grad_norm": 6.910099983215332, + "learning_rate": 3.380654691127563e-05, + "loss": 2.8517, + "step": 3125500 + }, + { + "epoch": 0.9717626176039492, + "grad_norm": 7.679991722106934, + "learning_rate": 3.3803956373267514e-05, + "loss": 2.889, + "step": 3126000 + }, + { + "epoch": 0.9719180498844361, + "grad_norm": 32.053226470947266, + "learning_rate": 3.38013658352594e-05, + "loss": 2.8457, + "step": 3126500 + }, + { + "epoch": 0.972073482164923, + "grad_norm": 6.6872334480285645, + "learning_rate": 3.379877529725128e-05, + "loss": 2.8778, + "step": 3127000 + }, + { + "epoch": 0.9722289144454098, + "grad_norm": 11.573616981506348, + "learning_rate": 3.379618475924317e-05, + "loss": 2.8834, + "step": 3127500 + }, + { + "epoch": 0.9723843467258967, + "grad_norm": 6.142255783081055, + "learning_rate": 3.3793594221235056e-05, + "loss": 2.8658, + "step": 3128000 + }, + { + "epoch": 0.9725397790063836, + "grad_norm": 14.486783981323242, + "learning_rate": 3.3791003683226943e-05, + "loss": 2.8712, + "step": 3128500 + }, + { + "epoch": 0.9726952112868705, + "grad_norm": 6.840188980102539, + "learning_rate": 3.378841314521883e-05, + "loss": 2.8971, + "step": 3129000 + }, + { + "epoch": 0.9728506435673574, + "grad_norm": 8.582557678222656, + "learning_rate": 3.378582260721071e-05, + "loss": 2.8728, + "step": 3129500 + }, + { + "epoch": 0.9730060758478443, + "grad_norm": 12.885784149169922, + "learning_rate": 3.37832320692026e-05, + "loss": 2.8841, + "step": 3130000 + }, + { + "epoch": 0.9731615081283311, + "grad_norm": 7.6937360763549805, + "learning_rate": 3.3780641531194485e-05, + "loss": 2.8611, + "step": 3130500 + }, + { + "epoch": 0.973316940408818, + "grad_norm": 11.732239723205566, + "learning_rate": 3.3778050993186366e-05, + "loss": 2.903, + "step": 3131000 + }, + { + "epoch": 0.9734723726893049, + "grad_norm": 11.257966041564941, + "learning_rate": 3.377546045517825e-05, + "loss": 2.8714, + "step": 3131500 + }, + { + "epoch": 0.9736278049697917, + "grad_norm": 8.326568603515625, + "learning_rate": 3.377286991717014e-05, + "loss": 2.8827, + "step": 3132000 + }, + { + "epoch": 0.9737832372502786, + "grad_norm": 8.668469429016113, + "learning_rate": 3.377027937916202e-05, + "loss": 2.8568, + "step": 3132500 + }, + { + "epoch": 0.9739386695307655, + "grad_norm": 8.78614616394043, + "learning_rate": 3.376768884115391e-05, + "loss": 2.8627, + "step": 3133000 + }, + { + "epoch": 0.9740941018112523, + "grad_norm": 9.962434768676758, + "learning_rate": 3.3765098303145795e-05, + "loss": 2.8969, + "step": 3133500 + }, + { + "epoch": 0.9742495340917392, + "grad_norm": 20.162818908691406, + "learning_rate": 3.376250776513768e-05, + "loss": 2.8337, + "step": 3134000 + }, + { + "epoch": 0.9744049663722261, + "grad_norm": 23.614208221435547, + "learning_rate": 3.375991722712957e-05, + "loss": 2.8825, + "step": 3134500 + }, + { + "epoch": 0.974560398652713, + "grad_norm": 8.038070678710938, + "learning_rate": 3.375732668912145e-05, + "loss": 2.8832, + "step": 3135000 + }, + { + "epoch": 0.9747158309331999, + "grad_norm": 7.664644718170166, + "learning_rate": 3.3754736151113336e-05, + "loss": 2.9, + "step": 3135500 + }, + { + "epoch": 0.9748712632136868, + "grad_norm": 9.022710800170898, + "learning_rate": 3.3752145613105224e-05, + "loss": 2.8841, + "step": 3136000 + }, + { + "epoch": 0.9750266954941736, + "grad_norm": 9.623568534851074, + "learning_rate": 3.3749555075097104e-05, + "loss": 2.8959, + "step": 3136500 + }, + { + "epoch": 0.9751821277746605, + "grad_norm": 7.53161096572876, + "learning_rate": 3.374696453708899e-05, + "loss": 2.8116, + "step": 3137000 + }, + { + "epoch": 0.9753375600551474, + "grad_norm": 7.297502040863037, + "learning_rate": 3.374437399908088e-05, + "loss": 2.8448, + "step": 3137500 + }, + { + "epoch": 0.9754929923356342, + "grad_norm": 7.605632781982422, + "learning_rate": 3.3741783461072765e-05, + "loss": 2.8698, + "step": 3138000 + }, + { + "epoch": 0.9756484246161211, + "grad_norm": 10.303030014038086, + "learning_rate": 3.373919292306465e-05, + "loss": 2.888, + "step": 3138500 + }, + { + "epoch": 0.975803856896608, + "grad_norm": 10.788385391235352, + "learning_rate": 3.373660238505654e-05, + "loss": 2.8604, + "step": 3139000 + }, + { + "epoch": 0.9759592891770948, + "grad_norm": 9.352005004882812, + "learning_rate": 3.373401184704842e-05, + "loss": 2.9198, + "step": 3139500 + }, + { + "epoch": 0.9761147214575817, + "grad_norm": 7.9051432609558105, + "learning_rate": 3.373142130904031e-05, + "loss": 2.8345, + "step": 3140000 + }, + { + "epoch": 0.9762701537380686, + "grad_norm": 10.15001106262207, + "learning_rate": 3.3728830771032194e-05, + "loss": 2.8373, + "step": 3140500 + }, + { + "epoch": 0.9764255860185556, + "grad_norm": 6.800963401794434, + "learning_rate": 3.3726240233024075e-05, + "loss": 2.8892, + "step": 3141000 + }, + { + "epoch": 0.9765810182990424, + "grad_norm": 22.071964263916016, + "learning_rate": 3.372364969501596e-05, + "loss": 2.8873, + "step": 3141500 + }, + { + "epoch": 0.9767364505795293, + "grad_norm": 8.883777618408203, + "learning_rate": 3.372105915700784e-05, + "loss": 2.8713, + "step": 3142000 + }, + { + "epoch": 0.9768918828600162, + "grad_norm": 11.183209419250488, + "learning_rate": 3.371846861899973e-05, + "loss": 2.8499, + "step": 3142500 + }, + { + "epoch": 0.977047315140503, + "grad_norm": 10.11511516571045, + "learning_rate": 3.371587808099162e-05, + "loss": 2.8568, + "step": 3143000 + }, + { + "epoch": 0.9772027474209899, + "grad_norm": 8.416482925415039, + "learning_rate": 3.3713287542983504e-05, + "loss": 2.8789, + "step": 3143500 + }, + { + "epoch": 0.9773581797014768, + "grad_norm": 8.062088966369629, + "learning_rate": 3.371069700497539e-05, + "loss": 2.8883, + "step": 3144000 + }, + { + "epoch": 0.9775136119819636, + "grad_norm": 9.418937683105469, + "learning_rate": 3.370810646696728e-05, + "loss": 2.8976, + "step": 3144500 + }, + { + "epoch": 0.9776690442624505, + "grad_norm": 8.869987487792969, + "learning_rate": 3.370551592895916e-05, + "loss": 2.8948, + "step": 3145000 + }, + { + "epoch": 0.9778244765429374, + "grad_norm": 7.999192237854004, + "learning_rate": 3.3702925390951046e-05, + "loss": 2.8675, + "step": 3145500 + }, + { + "epoch": 0.9779799088234242, + "grad_norm": 8.638005256652832, + "learning_rate": 3.370033485294293e-05, + "loss": 2.897, + "step": 3146000 + }, + { + "epoch": 0.9781353411039111, + "grad_norm": 9.825629234313965, + "learning_rate": 3.369774431493481e-05, + "loss": 2.8434, + "step": 3146500 + }, + { + "epoch": 0.9782907733843981, + "grad_norm": 7.212456703186035, + "learning_rate": 3.36951537769267e-05, + "loss": 2.8698, + "step": 3147000 + }, + { + "epoch": 0.9784462056648849, + "grad_norm": 7.948273658752441, + "learning_rate": 3.369256323891858e-05, + "loss": 2.8696, + "step": 3147500 + }, + { + "epoch": 0.9786016379453718, + "grad_norm": 7.496802806854248, + "learning_rate": 3.3689972700910475e-05, + "loss": 2.9077, + "step": 3148000 + }, + { + "epoch": 0.9787570702258587, + "grad_norm": 7.758663177490234, + "learning_rate": 3.368738216290236e-05, + "loss": 2.8498, + "step": 3148500 + }, + { + "epoch": 0.9789125025063455, + "grad_norm": 9.625085830688477, + "learning_rate": 3.368479162489424e-05, + "loss": 2.8713, + "step": 3149000 + }, + { + "epoch": 0.9790679347868324, + "grad_norm": 7.265480995178223, + "learning_rate": 3.368220108688613e-05, + "loss": 2.8597, + "step": 3149500 + }, + { + "epoch": 0.9792233670673193, + "grad_norm": 7.015977382659912, + "learning_rate": 3.3679610548878017e-05, + "loss": 2.8691, + "step": 3150000 + }, + { + "epoch": 0.9793787993478061, + "grad_norm": 7.4615983963012695, + "learning_rate": 3.36770200108699e-05, + "loss": 2.8867, + "step": 3150500 + }, + { + "epoch": 0.979534231628293, + "grad_norm": 8.690483093261719, + "learning_rate": 3.3674429472861784e-05, + "loss": 2.8529, + "step": 3151000 + }, + { + "epoch": 0.9796896639087799, + "grad_norm": 10.733717918395996, + "learning_rate": 3.367183893485367e-05, + "loss": 2.9017, + "step": 3151500 + }, + { + "epoch": 0.9798450961892667, + "grad_norm": 15.431743621826172, + "learning_rate": 3.366924839684555e-05, + "loss": 2.873, + "step": 3152000 + }, + { + "epoch": 0.9800005284697536, + "grad_norm": 9.714391708374023, + "learning_rate": 3.366665785883744e-05, + "loss": 2.881, + "step": 3152500 + }, + { + "epoch": 0.9801559607502406, + "grad_norm": 7.689422130584717, + "learning_rate": 3.3664067320829326e-05, + "loss": 2.8657, + "step": 3153000 + }, + { + "epoch": 0.9803113930307275, + "grad_norm": 22.87643814086914, + "learning_rate": 3.366147678282121e-05, + "loss": 2.8795, + "step": 3153500 + }, + { + "epoch": 0.9804668253112143, + "grad_norm": 8.846226692199707, + "learning_rate": 3.36588862448131e-05, + "loss": 2.8902, + "step": 3154000 + }, + { + "epoch": 0.9806222575917012, + "grad_norm": 14.664680480957031, + "learning_rate": 3.365629570680498e-05, + "loss": 2.874, + "step": 3154500 + }, + { + "epoch": 0.980777689872188, + "grad_norm": 7.6423139572143555, + "learning_rate": 3.365370516879687e-05, + "loss": 2.8273, + "step": 3155000 + }, + { + "epoch": 0.9809331221526749, + "grad_norm": 7.131484508514404, + "learning_rate": 3.3651114630788755e-05, + "loss": 2.9132, + "step": 3155500 + }, + { + "epoch": 0.9810885544331618, + "grad_norm": 15.562799453735352, + "learning_rate": 3.3648524092780635e-05, + "loss": 2.9008, + "step": 3156000 + }, + { + "epoch": 0.9812439867136487, + "grad_norm": 8.045949935913086, + "learning_rate": 3.364593355477252e-05, + "loss": 2.8924, + "step": 3156500 + }, + { + "epoch": 0.9813994189941355, + "grad_norm": 10.679006576538086, + "learning_rate": 3.364334301676441e-05, + "loss": 2.8741, + "step": 3157000 + }, + { + "epoch": 0.9815548512746224, + "grad_norm": 10.804262161254883, + "learning_rate": 3.36407524787563e-05, + "loss": 2.8629, + "step": 3157500 + }, + { + "epoch": 0.9817102835551093, + "grad_norm": 7.796062469482422, + "learning_rate": 3.3638161940748184e-05, + "loss": 2.8884, + "step": 3158000 + }, + { + "epoch": 0.9818657158355961, + "grad_norm": 6.711696147918701, + "learning_rate": 3.363557140274007e-05, + "loss": 2.8625, + "step": 3158500 + }, + { + "epoch": 0.9820211481160831, + "grad_norm": 8.831467628479004, + "learning_rate": 3.363298086473195e-05, + "loss": 2.8162, + "step": 3159000 + }, + { + "epoch": 0.98217658039657, + "grad_norm": 10.221585273742676, + "learning_rate": 3.363039032672384e-05, + "loss": 2.8688, + "step": 3159500 + }, + { + "epoch": 0.9823320126770568, + "grad_norm": 7.322482585906982, + "learning_rate": 3.362779978871572e-05, + "loss": 2.9175, + "step": 3160000 + }, + { + "epoch": 0.9824874449575437, + "grad_norm": 5.633291721343994, + "learning_rate": 3.3625209250707606e-05, + "loss": 2.9241, + "step": 3160500 + }, + { + "epoch": 0.9826428772380306, + "grad_norm": 12.243720054626465, + "learning_rate": 3.362261871269949e-05, + "loss": 2.8561, + "step": 3161000 + }, + { + "epoch": 0.9827983095185174, + "grad_norm": 20.02893829345703, + "learning_rate": 3.3620028174691374e-05, + "loss": 2.8582, + "step": 3161500 + }, + { + "epoch": 0.9829537417990043, + "grad_norm": 8.583598136901855, + "learning_rate": 3.361743763668326e-05, + "loss": 2.8925, + "step": 3162000 + }, + { + "epoch": 0.9831091740794912, + "grad_norm": 10.849913597106934, + "learning_rate": 3.361484709867515e-05, + "loss": 2.8723, + "step": 3162500 + }, + { + "epoch": 0.983264606359978, + "grad_norm": 9.225594520568848, + "learning_rate": 3.3612256560667035e-05, + "loss": 2.8738, + "step": 3163000 + }, + { + "epoch": 0.9834200386404649, + "grad_norm": 7.925206184387207, + "learning_rate": 3.360966602265892e-05, + "loss": 2.9108, + "step": 3163500 + }, + { + "epoch": 0.9835754709209518, + "grad_norm": 10.573694229125977, + "learning_rate": 3.360707548465081e-05, + "loss": 2.8631, + "step": 3164000 + }, + { + "epoch": 0.9837309032014386, + "grad_norm": 8.13892650604248, + "learning_rate": 3.360448494664269e-05, + "loss": 2.9112, + "step": 3164500 + }, + { + "epoch": 0.9838863354819256, + "grad_norm": 9.65109634399414, + "learning_rate": 3.360189440863458e-05, + "loss": 2.9026, + "step": 3165000 + }, + { + "epoch": 0.9840417677624125, + "grad_norm": 9.445426940917969, + "learning_rate": 3.359930387062646e-05, + "loss": 2.8769, + "step": 3165500 + }, + { + "epoch": 0.9841972000428993, + "grad_norm": 5.739779949188232, + "learning_rate": 3.3596713332618344e-05, + "loss": 2.8354, + "step": 3166000 + }, + { + "epoch": 0.9843526323233862, + "grad_norm": 9.591711044311523, + "learning_rate": 3.359412279461023e-05, + "loss": 2.8619, + "step": 3166500 + }, + { + "epoch": 0.9845080646038731, + "grad_norm": 8.370671272277832, + "learning_rate": 3.359153225660211e-05, + "loss": 2.8899, + "step": 3167000 + }, + { + "epoch": 0.98466349688436, + "grad_norm": 9.294529914855957, + "learning_rate": 3.3588941718594006e-05, + "loss": 2.9141, + "step": 3167500 + }, + { + "epoch": 0.9848189291648468, + "grad_norm": 12.730295181274414, + "learning_rate": 3.358635118058589e-05, + "loss": 2.9279, + "step": 3168000 + }, + { + "epoch": 0.9849743614453337, + "grad_norm": 9.079634666442871, + "learning_rate": 3.3583760642577773e-05, + "loss": 2.8726, + "step": 3168500 + }, + { + "epoch": 0.9851297937258205, + "grad_norm": 14.926607131958008, + "learning_rate": 3.358117010456966e-05, + "loss": 2.8787, + "step": 3169000 + }, + { + "epoch": 0.9852852260063074, + "grad_norm": 12.683526039123535, + "learning_rate": 3.357857956656155e-05, + "loss": 2.8746, + "step": 3169500 + }, + { + "epoch": 0.9854406582867943, + "grad_norm": 6.917855262756348, + "learning_rate": 3.357598902855343e-05, + "loss": 2.8921, + "step": 3170000 + }, + { + "epoch": 0.9855960905672811, + "grad_norm": 8.783614158630371, + "learning_rate": 3.3573398490545315e-05, + "loss": 2.8076, + "step": 3170500 + }, + { + "epoch": 0.9857515228477681, + "grad_norm": 11.678174018859863, + "learning_rate": 3.3570807952537196e-05, + "loss": 2.8986, + "step": 3171000 + }, + { + "epoch": 0.985906955128255, + "grad_norm": 8.125876426696777, + "learning_rate": 3.356821741452908e-05, + "loss": 2.8275, + "step": 3171500 + }, + { + "epoch": 0.9860623874087419, + "grad_norm": 10.376504898071289, + "learning_rate": 3.356562687652097e-05, + "loss": 2.8763, + "step": 3172000 + }, + { + "epoch": 0.9862178196892287, + "grad_norm": 9.755495071411133, + "learning_rate": 3.356303633851286e-05, + "loss": 2.8525, + "step": 3172500 + }, + { + "epoch": 0.9863732519697156, + "grad_norm": 7.804410457611084, + "learning_rate": 3.3560445800504744e-05, + "loss": 2.862, + "step": 3173000 + }, + { + "epoch": 0.9865286842502025, + "grad_norm": 9.473995208740234, + "learning_rate": 3.355785526249663e-05, + "loss": 2.9069, + "step": 3173500 + }, + { + "epoch": 0.9866841165306893, + "grad_norm": 7.683796405792236, + "learning_rate": 3.355526472448851e-05, + "loss": 2.8882, + "step": 3174000 + }, + { + "epoch": 0.9868395488111762, + "grad_norm": 8.851447105407715, + "learning_rate": 3.35526741864804e-05, + "loss": 2.8882, + "step": 3174500 + }, + { + "epoch": 0.9869949810916631, + "grad_norm": 15.924004554748535, + "learning_rate": 3.3550083648472286e-05, + "loss": 2.8513, + "step": 3175000 + }, + { + "epoch": 0.9871504133721499, + "grad_norm": 7.423511505126953, + "learning_rate": 3.3547493110464166e-05, + "loss": 2.8703, + "step": 3175500 + }, + { + "epoch": 0.9873058456526368, + "grad_norm": 8.256367683410645, + "learning_rate": 3.3544902572456054e-05, + "loss": 2.8934, + "step": 3176000 + }, + { + "epoch": 0.9874612779331237, + "grad_norm": 9.998709678649902, + "learning_rate": 3.354231203444794e-05, + "loss": 2.8982, + "step": 3176500 + }, + { + "epoch": 0.9876167102136106, + "grad_norm": 8.914841651916504, + "learning_rate": 3.353972149643982e-05, + "loss": 2.9103, + "step": 3177000 + }, + { + "epoch": 0.9877721424940975, + "grad_norm": 14.234236717224121, + "learning_rate": 3.3537130958431715e-05, + "loss": 2.8985, + "step": 3177500 + }, + { + "epoch": 0.9879275747745844, + "grad_norm": 9.682780265808105, + "learning_rate": 3.3534540420423595e-05, + "loss": 2.8601, + "step": 3178000 + }, + { + "epoch": 0.9880830070550712, + "grad_norm": 11.635504722595215, + "learning_rate": 3.353194988241548e-05, + "loss": 2.9243, + "step": 3178500 + }, + { + "epoch": 0.9882384393355581, + "grad_norm": 10.372788429260254, + "learning_rate": 3.352935934440737e-05, + "loss": 2.9043, + "step": 3179000 + }, + { + "epoch": 0.988393871616045, + "grad_norm": 7.545138835906982, + "learning_rate": 3.352676880639925e-05, + "loss": 2.905, + "step": 3179500 + }, + { + "epoch": 0.9885493038965318, + "grad_norm": 12.800713539123535, + "learning_rate": 3.352417826839114e-05, + "loss": 2.9523, + "step": 3180000 + }, + { + "epoch": 0.9887047361770187, + "grad_norm": 20.990814208984375, + "learning_rate": 3.3521587730383024e-05, + "loss": 2.8616, + "step": 3180500 + }, + { + "epoch": 0.9888601684575056, + "grad_norm": 8.834709167480469, + "learning_rate": 3.3518997192374905e-05, + "loss": 2.8703, + "step": 3181000 + }, + { + "epoch": 0.9890156007379924, + "grad_norm": 9.887659072875977, + "learning_rate": 3.351640665436679e-05, + "loss": 2.8869, + "step": 3181500 + }, + { + "epoch": 0.9891710330184793, + "grad_norm": 14.858494758605957, + "learning_rate": 3.351381611635868e-05, + "loss": 2.846, + "step": 3182000 + }, + { + "epoch": 0.9893264652989662, + "grad_norm": 6.963049411773682, + "learning_rate": 3.3511225578350566e-05, + "loss": 2.8535, + "step": 3182500 + }, + { + "epoch": 0.9894818975794532, + "grad_norm": 8.398548126220703, + "learning_rate": 3.3508635040342453e-05, + "loss": 2.8412, + "step": 3183000 + }, + { + "epoch": 0.98963732985994, + "grad_norm": 24.957012176513672, + "learning_rate": 3.3506044502334334e-05, + "loss": 2.867, + "step": 3183500 + }, + { + "epoch": 0.9897927621404269, + "grad_norm": 8.10923957824707, + "learning_rate": 3.350345396432622e-05, + "loss": 2.8897, + "step": 3184000 + }, + { + "epoch": 0.9899481944209138, + "grad_norm": 8.884135246276855, + "learning_rate": 3.350086342631811e-05, + "loss": 2.8603, + "step": 3184500 + }, + { + "epoch": 0.9901036267014006, + "grad_norm": 7.221673488616943, + "learning_rate": 3.349827288830999e-05, + "loss": 2.8387, + "step": 3185000 + }, + { + "epoch": 0.9902590589818875, + "grad_norm": 12.35584831237793, + "learning_rate": 3.3495682350301876e-05, + "loss": 2.9074, + "step": 3185500 + }, + { + "epoch": 0.9904144912623744, + "grad_norm": 6.785740375518799, + "learning_rate": 3.349309181229376e-05, + "loss": 2.8523, + "step": 3186000 + }, + { + "epoch": 0.9905699235428612, + "grad_norm": 6.965710639953613, + "learning_rate": 3.349050127428564e-05, + "loss": 2.8489, + "step": 3186500 + }, + { + "epoch": 0.9907253558233481, + "grad_norm": 16.798107147216797, + "learning_rate": 3.348791073627753e-05, + "loss": 2.9033, + "step": 3187000 + }, + { + "epoch": 0.990880788103835, + "grad_norm": 7.692028999328613, + "learning_rate": 3.3485320198269424e-05, + "loss": 2.8396, + "step": 3187500 + }, + { + "epoch": 0.9910362203843218, + "grad_norm": 9.712327003479004, + "learning_rate": 3.3482729660261305e-05, + "loss": 2.8737, + "step": 3188000 + }, + { + "epoch": 0.9911916526648087, + "grad_norm": 8.845882415771484, + "learning_rate": 3.348013912225319e-05, + "loss": 2.9076, + "step": 3188500 + }, + { + "epoch": 0.9913470849452956, + "grad_norm": 8.441109657287598, + "learning_rate": 3.347754858424508e-05, + "loss": 2.8484, + "step": 3189000 + }, + { + "epoch": 0.9915025172257825, + "grad_norm": 11.198019027709961, + "learning_rate": 3.347495804623696e-05, + "loss": 2.8717, + "step": 3189500 + }, + { + "epoch": 0.9916579495062694, + "grad_norm": 10.91462516784668, + "learning_rate": 3.3472367508228846e-05, + "loss": 2.8523, + "step": 3190000 + }, + { + "epoch": 0.9918133817867563, + "grad_norm": 14.395614624023438, + "learning_rate": 3.346977697022073e-05, + "loss": 2.8322, + "step": 3190500 + }, + { + "epoch": 0.9919688140672431, + "grad_norm": 8.787210464477539, + "learning_rate": 3.3467186432212614e-05, + "loss": 2.9098, + "step": 3191000 + }, + { + "epoch": 0.99212424634773, + "grad_norm": 11.633357048034668, + "learning_rate": 3.34645958942045e-05, + "loss": 2.8598, + "step": 3191500 + }, + { + "epoch": 0.9922796786282169, + "grad_norm": 7.959670543670654, + "learning_rate": 3.346200535619639e-05, + "loss": 2.8268, + "step": 3192000 + }, + { + "epoch": 0.9924351109087037, + "grad_norm": 9.63972282409668, + "learning_rate": 3.3459414818188275e-05, + "loss": 2.8648, + "step": 3192500 + }, + { + "epoch": 0.9925905431891906, + "grad_norm": 8.732828140258789, + "learning_rate": 3.345682428018016e-05, + "loss": 2.8878, + "step": 3193000 + }, + { + "epoch": 0.9927459754696775, + "grad_norm": 9.326374053955078, + "learning_rate": 3.345423374217204e-05, + "loss": 2.8903, + "step": 3193500 + }, + { + "epoch": 0.9929014077501643, + "grad_norm": 10.257091522216797, + "learning_rate": 3.345164320416393e-05, + "loss": 2.8789, + "step": 3194000 + }, + { + "epoch": 0.9930568400306512, + "grad_norm": 8.600858688354492, + "learning_rate": 3.344905266615582e-05, + "loss": 2.878, + "step": 3194500 + }, + { + "epoch": 0.9932122723111381, + "grad_norm": 8.991179466247559, + "learning_rate": 3.34464621281477e-05, + "loss": 2.8523, + "step": 3195000 + }, + { + "epoch": 0.993367704591625, + "grad_norm": 8.470208168029785, + "learning_rate": 3.3443871590139585e-05, + "loss": 2.8825, + "step": 3195500 + }, + { + "epoch": 0.9935231368721119, + "grad_norm": 7.3270673751831055, + "learning_rate": 3.3441281052131465e-05, + "loss": 2.8708, + "step": 3196000 + }, + { + "epoch": 0.9936785691525988, + "grad_norm": 7.784712791442871, + "learning_rate": 3.343869051412335e-05, + "loss": 2.8714, + "step": 3196500 + }, + { + "epoch": 0.9938340014330856, + "grad_norm": 8.592342376708984, + "learning_rate": 3.343609997611524e-05, + "loss": 2.8987, + "step": 3197000 + }, + { + "epoch": 0.9939894337135725, + "grad_norm": 8.178077697753906, + "learning_rate": 3.343350943810713e-05, + "loss": 2.8875, + "step": 3197500 + }, + { + "epoch": 0.9941448659940594, + "grad_norm": 9.654512405395508, + "learning_rate": 3.3430918900099014e-05, + "loss": 2.9264, + "step": 3198000 + }, + { + "epoch": 0.9943002982745462, + "grad_norm": 10.393415451049805, + "learning_rate": 3.34283283620909e-05, + "loss": 2.8839, + "step": 3198500 + }, + { + "epoch": 0.9944557305550331, + "grad_norm": 6.653029441833496, + "learning_rate": 3.342573782408278e-05, + "loss": 2.8742, + "step": 3199000 + }, + { + "epoch": 0.99461116283552, + "grad_norm": 9.743016242980957, + "learning_rate": 3.342314728607467e-05, + "loss": 2.8919, + "step": 3199500 + }, + { + "epoch": 0.9947665951160068, + "grad_norm": 23.29438018798828, + "learning_rate": 3.3420556748066556e-05, + "loss": 2.8623, + "step": 3200000 + }, + { + "epoch": 0.9949220273964937, + "grad_norm": 10.138871192932129, + "learning_rate": 3.3417966210058436e-05, + "loss": 2.9265, + "step": 3200500 + }, + { + "epoch": 0.9950774596769806, + "grad_norm": 8.298201560974121, + "learning_rate": 3.341537567205032e-05, + "loss": 2.8606, + "step": 3201000 + }, + { + "epoch": 0.9952328919574676, + "grad_norm": 6.941729545593262, + "learning_rate": 3.341278513404221e-05, + "loss": 2.895, + "step": 3201500 + }, + { + "epoch": 0.9953883242379544, + "grad_norm": 10.513983726501465, + "learning_rate": 3.34101945960341e-05, + "loss": 2.8616, + "step": 3202000 + }, + { + "epoch": 0.9955437565184413, + "grad_norm": 19.22151756286621, + "learning_rate": 3.3407604058025985e-05, + "loss": 2.8323, + "step": 3202500 + }, + { + "epoch": 0.9956991887989282, + "grad_norm": 7.653061389923096, + "learning_rate": 3.3405013520017865e-05, + "loss": 2.8623, + "step": 3203000 + }, + { + "epoch": 0.995854621079415, + "grad_norm": 8.158596992492676, + "learning_rate": 3.340242298200975e-05, + "loss": 2.9257, + "step": 3203500 + }, + { + "epoch": 0.9960100533599019, + "grad_norm": 9.228448867797852, + "learning_rate": 3.339983244400164e-05, + "loss": 2.9088, + "step": 3204000 + }, + { + "epoch": 0.9961654856403888, + "grad_norm": 8.720988273620605, + "learning_rate": 3.339724190599352e-05, + "loss": 2.8421, + "step": 3204500 + }, + { + "epoch": 0.9963209179208756, + "grad_norm": 14.311006546020508, + "learning_rate": 3.339465136798541e-05, + "loss": 2.8742, + "step": 3205000 + }, + { + "epoch": 0.9964763502013625, + "grad_norm": 8.385830879211426, + "learning_rate": 3.3392060829977294e-05, + "loss": 2.853, + "step": 3205500 + }, + { + "epoch": 0.9966317824818494, + "grad_norm": 9.005325317382812, + "learning_rate": 3.3389470291969174e-05, + "loss": 2.9136, + "step": 3206000 + }, + { + "epoch": 0.9967872147623362, + "grad_norm": 8.210746765136719, + "learning_rate": 3.338687975396106e-05, + "loss": 2.8954, + "step": 3206500 + }, + { + "epoch": 0.9969426470428231, + "grad_norm": 7.291170597076416, + "learning_rate": 3.338428921595295e-05, + "loss": 2.8447, + "step": 3207000 + }, + { + "epoch": 0.9970980793233101, + "grad_norm": 9.226125717163086, + "learning_rate": 3.3381698677944836e-05, + "loss": 2.8808, + "step": 3207500 + }, + { + "epoch": 0.9972535116037969, + "grad_norm": 9.096026420593262, + "learning_rate": 3.337910813993672e-05, + "loss": 2.8744, + "step": 3208000 + }, + { + "epoch": 0.9974089438842838, + "grad_norm": 10.841723442077637, + "learning_rate": 3.3376517601928603e-05, + "loss": 2.8738, + "step": 3208500 + }, + { + "epoch": 0.9975643761647707, + "grad_norm": 10.005516052246094, + "learning_rate": 3.337392706392049e-05, + "loss": 2.8993, + "step": 3209000 + }, + { + "epoch": 0.9977198084452575, + "grad_norm": 8.86292552947998, + "learning_rate": 3.337133652591238e-05, + "loss": 2.8918, + "step": 3209500 + }, + { + "epoch": 0.9978752407257444, + "grad_norm": 7.765566349029541, + "learning_rate": 3.336874598790426e-05, + "loss": 2.8665, + "step": 3210000 + }, + { + "epoch": 0.9980306730062313, + "grad_norm": 10.29499626159668, + "learning_rate": 3.3366155449896145e-05, + "loss": 2.8349, + "step": 3210500 + }, + { + "epoch": 0.9981861052867181, + "grad_norm": 9.10944938659668, + "learning_rate": 3.336356491188803e-05, + "loss": 2.842, + "step": 3211000 + }, + { + "epoch": 0.998341537567205, + "grad_norm": 8.109428405761719, + "learning_rate": 3.336097437387992e-05, + "loss": 2.8854, + "step": 3211500 + }, + { + "epoch": 0.9984969698476919, + "grad_norm": 9.575773239135742, + "learning_rate": 3.335838383587181e-05, + "loss": 2.8744, + "step": 3212000 + }, + { + "epoch": 0.9986524021281787, + "grad_norm": 8.908020973205566, + "learning_rate": 3.3355793297863694e-05, + "loss": 2.8552, + "step": 3212500 + }, + { + "epoch": 0.9988078344086656, + "grad_norm": 10.201581954956055, + "learning_rate": 3.3353202759855574e-05, + "loss": 2.8536, + "step": 3213000 + }, + { + "epoch": 0.9989632666891526, + "grad_norm": 7.84602165222168, + "learning_rate": 3.335061222184746e-05, + "loss": 2.883, + "step": 3213500 + }, + { + "epoch": 0.9991186989696395, + "grad_norm": 9.141169548034668, + "learning_rate": 3.334802168383934e-05, + "loss": 2.8681, + "step": 3214000 + }, + { + "epoch": 0.9992741312501263, + "grad_norm": 13.153119087219238, + "learning_rate": 3.334543114583123e-05, + "loss": 2.8992, + "step": 3214500 + }, + { + "epoch": 0.9994295635306132, + "grad_norm": 8.618964195251465, + "learning_rate": 3.3342840607823116e-05, + "loss": 2.8665, + "step": 3215000 + }, + { + "epoch": 0.9995849958111, + "grad_norm": 7.099290370941162, + "learning_rate": 3.3340250069814996e-05, + "loss": 2.9039, + "step": 3215500 + }, + { + "epoch": 0.9997404280915869, + "grad_norm": 12.091672897338867, + "learning_rate": 3.3337659531806884e-05, + "loss": 2.8824, + "step": 3216000 + }, + { + "epoch": 0.9998958603720738, + "grad_norm": 7.586438179016113, + "learning_rate": 3.333506899379877e-05, + "loss": 2.8591, + "step": 3216500 + }, + { + "epoch": 1.0000512926525607, + "grad_norm": 8.492100715637207, + "learning_rate": 3.333247845579066e-05, + "loss": 2.9074, + "step": 3217000 + }, + { + "epoch": 1.0002067249330475, + "grad_norm": 9.553727149963379, + "learning_rate": 3.3329887917782545e-05, + "loss": 2.812, + "step": 3217500 + }, + { + "epoch": 1.0003621572135344, + "grad_norm": 11.912084579467773, + "learning_rate": 3.332729737977443e-05, + "loss": 2.88, + "step": 3218000 + }, + { + "epoch": 1.0005175894940213, + "grad_norm": 8.711150169372559, + "learning_rate": 3.332470684176631e-05, + "loss": 2.8789, + "step": 3218500 + }, + { + "epoch": 1.0006730217745081, + "grad_norm": 14.218607902526855, + "learning_rate": 3.33221163037582e-05, + "loss": 2.84, + "step": 3219000 + }, + { + "epoch": 1.000828454054995, + "grad_norm": 9.714043617248535, + "learning_rate": 3.331952576575008e-05, + "loss": 2.8579, + "step": 3219500 + }, + { + "epoch": 1.0009838863354819, + "grad_norm": 9.918449401855469, + "learning_rate": 3.331693522774197e-05, + "loss": 2.9187, + "step": 3220000 + }, + { + "epoch": 1.0011393186159687, + "grad_norm": 8.73275089263916, + "learning_rate": 3.3314344689733854e-05, + "loss": 2.866, + "step": 3220500 + }, + { + "epoch": 1.0012947508964556, + "grad_norm": 8.518253326416016, + "learning_rate": 3.3311754151725735e-05, + "loss": 2.8244, + "step": 3221000 + }, + { + "epoch": 1.0014501831769425, + "grad_norm": 10.634537696838379, + "learning_rate": 3.330916361371763e-05, + "loss": 2.8255, + "step": 3221500 + }, + { + "epoch": 1.0016056154574293, + "grad_norm": 10.323773384094238, + "learning_rate": 3.3306573075709516e-05, + "loss": 2.89, + "step": 3222000 + }, + { + "epoch": 1.0017610477379164, + "grad_norm": 8.016762733459473, + "learning_rate": 3.3303982537701396e-05, + "loss": 2.8487, + "step": 3222500 + }, + { + "epoch": 1.0019164800184033, + "grad_norm": 10.349032402038574, + "learning_rate": 3.3301391999693283e-05, + "loss": 2.8347, + "step": 3223000 + }, + { + "epoch": 1.0020719122988901, + "grad_norm": 10.08687686920166, + "learning_rate": 3.329880146168517e-05, + "loss": 2.8444, + "step": 3223500 + }, + { + "epoch": 1.002227344579377, + "grad_norm": 8.429219245910645, + "learning_rate": 3.329621092367705e-05, + "loss": 2.8479, + "step": 3224000 + }, + { + "epoch": 1.0023827768598639, + "grad_norm": 12.93928050994873, + "learning_rate": 3.329362038566894e-05, + "loss": 2.8637, + "step": 3224500 + }, + { + "epoch": 1.0025382091403507, + "grad_norm": 9.4644775390625, + "learning_rate": 3.3291029847660825e-05, + "loss": 2.8414, + "step": 3225000 + }, + { + "epoch": 1.0026936414208376, + "grad_norm": 33.554779052734375, + "learning_rate": 3.3288439309652706e-05, + "loss": 2.8829, + "step": 3225500 + }, + { + "epoch": 1.0028490737013245, + "grad_norm": 13.752250671386719, + "learning_rate": 3.328584877164459e-05, + "loss": 2.819, + "step": 3226000 + }, + { + "epoch": 1.0030045059818113, + "grad_norm": 8.017568588256836, + "learning_rate": 3.328325823363648e-05, + "loss": 2.8949, + "step": 3226500 + }, + { + "epoch": 1.0031599382622982, + "grad_norm": 9.297319412231445, + "learning_rate": 3.328066769562837e-05, + "loss": 2.8843, + "step": 3227000 + }, + { + "epoch": 1.003315370542785, + "grad_norm": 9.424711227416992, + "learning_rate": 3.3278077157620254e-05, + "loss": 2.8429, + "step": 3227500 + }, + { + "epoch": 1.003470802823272, + "grad_norm": 11.11260986328125, + "learning_rate": 3.3275486619612135e-05, + "loss": 2.8771, + "step": 3228000 + }, + { + "epoch": 1.0036262351037588, + "grad_norm": 8.17729663848877, + "learning_rate": 3.327289608160402e-05, + "loss": 2.8803, + "step": 3228500 + }, + { + "epoch": 1.0037816673842457, + "grad_norm": 14.094032287597656, + "learning_rate": 3.327030554359591e-05, + "loss": 2.8784, + "step": 3229000 + }, + { + "epoch": 1.0039370996647325, + "grad_norm": 8.543367385864258, + "learning_rate": 3.326771500558779e-05, + "loss": 2.9194, + "step": 3229500 + }, + { + "epoch": 1.0040925319452194, + "grad_norm": 6.035676956176758, + "learning_rate": 3.3265124467579676e-05, + "loss": 2.8412, + "step": 3230000 + }, + { + "epoch": 1.0042479642257063, + "grad_norm": 4.7639594078063965, + "learning_rate": 3.3262533929571564e-05, + "loss": 2.8736, + "step": 3230500 + }, + { + "epoch": 1.0044033965061931, + "grad_norm": 15.2398681640625, + "learning_rate": 3.3259943391563444e-05, + "loss": 2.8498, + "step": 3231000 + }, + { + "epoch": 1.00455882878668, + "grad_norm": 12.206778526306152, + "learning_rate": 3.325735285355534e-05, + "loss": 2.882, + "step": 3231500 + }, + { + "epoch": 1.0047142610671669, + "grad_norm": 8.608509063720703, + "learning_rate": 3.325476231554722e-05, + "loss": 2.8991, + "step": 3232000 + }, + { + "epoch": 1.0048696933476537, + "grad_norm": 9.137762069702148, + "learning_rate": 3.3252171777539105e-05, + "loss": 2.8503, + "step": 3232500 + }, + { + "epoch": 1.0050251256281406, + "grad_norm": 8.000876426696777, + "learning_rate": 3.324958123953099e-05, + "loss": 2.8842, + "step": 3233000 + }, + { + "epoch": 1.0051805579086275, + "grad_norm": 17.593263626098633, + "learning_rate": 3.324699070152287e-05, + "loss": 2.9005, + "step": 3233500 + }, + { + "epoch": 1.0053359901891143, + "grad_norm": 7.229274749755859, + "learning_rate": 3.324440016351476e-05, + "loss": 2.8589, + "step": 3234000 + }, + { + "epoch": 1.0054914224696014, + "grad_norm": 8.932969093322754, + "learning_rate": 3.324180962550665e-05, + "loss": 2.8536, + "step": 3234500 + }, + { + "epoch": 1.0056468547500883, + "grad_norm": 10.433249473571777, + "learning_rate": 3.323921908749853e-05, + "loss": 2.8651, + "step": 3235000 + }, + { + "epoch": 1.0058022870305752, + "grad_norm": 7.401496410369873, + "learning_rate": 3.3236628549490415e-05, + "loss": 2.8905, + "step": 3235500 + }, + { + "epoch": 1.005957719311062, + "grad_norm": 9.534801483154297, + "learning_rate": 3.32340380114823e-05, + "loss": 2.8632, + "step": 3236000 + }, + { + "epoch": 1.006113151591549, + "grad_norm": 7.474560260772705, + "learning_rate": 3.323144747347419e-05, + "loss": 2.8269, + "step": 3236500 + }, + { + "epoch": 1.0062685838720358, + "grad_norm": 9.414377212524414, + "learning_rate": 3.3228856935466076e-05, + "loss": 2.9004, + "step": 3237000 + }, + { + "epoch": 1.0064240161525226, + "grad_norm": 12.09658432006836, + "learning_rate": 3.322626639745796e-05, + "loss": 2.8684, + "step": 3237500 + }, + { + "epoch": 1.0065794484330095, + "grad_norm": 7.655840873718262, + "learning_rate": 3.3223675859449844e-05, + "loss": 2.9047, + "step": 3238000 + }, + { + "epoch": 1.0067348807134964, + "grad_norm": 9.756186485290527, + "learning_rate": 3.322108532144173e-05, + "loss": 2.8602, + "step": 3238500 + }, + { + "epoch": 1.0068903129939832, + "grad_norm": 11.773582458496094, + "learning_rate": 3.321849478343361e-05, + "loss": 2.8526, + "step": 3239000 + }, + { + "epoch": 1.00704574527447, + "grad_norm": 27.821308135986328, + "learning_rate": 3.32159042454255e-05, + "loss": 2.8588, + "step": 3239500 + }, + { + "epoch": 1.007201177554957, + "grad_norm": 10.586666107177734, + "learning_rate": 3.3213313707417386e-05, + "loss": 2.8782, + "step": 3240000 + }, + { + "epoch": 1.0073566098354438, + "grad_norm": 11.974910736083984, + "learning_rate": 3.3210723169409266e-05, + "loss": 2.8724, + "step": 3240500 + }, + { + "epoch": 1.0075120421159307, + "grad_norm": 8.927606582641602, + "learning_rate": 3.320813263140115e-05, + "loss": 2.795, + "step": 3241000 + }, + { + "epoch": 1.0076674743964176, + "grad_norm": 6.625481128692627, + "learning_rate": 3.320554209339305e-05, + "loss": 2.894, + "step": 3241500 + }, + { + "epoch": 1.0078229066769044, + "grad_norm": 9.589674949645996, + "learning_rate": 3.320295155538493e-05, + "loss": 2.8611, + "step": 3242000 + }, + { + "epoch": 1.0079783389573913, + "grad_norm": 9.927062034606934, + "learning_rate": 3.3200361017376815e-05, + "loss": 2.8818, + "step": 3242500 + }, + { + "epoch": 1.0081337712378782, + "grad_norm": 11.347478866577148, + "learning_rate": 3.31977704793687e-05, + "loss": 2.904, + "step": 3243000 + }, + { + "epoch": 1.008289203518365, + "grad_norm": 9.556273460388184, + "learning_rate": 3.319517994136058e-05, + "loss": 2.9205, + "step": 3243500 + }, + { + "epoch": 1.008444635798852, + "grad_norm": 11.697050094604492, + "learning_rate": 3.319258940335247e-05, + "loss": 2.8842, + "step": 3244000 + }, + { + "epoch": 1.0086000680793388, + "grad_norm": 8.285707473754883, + "learning_rate": 3.318999886534435e-05, + "loss": 2.8906, + "step": 3244500 + }, + { + "epoch": 1.0087555003598256, + "grad_norm": 20.505664825439453, + "learning_rate": 3.318740832733624e-05, + "loss": 2.8405, + "step": 3245000 + }, + { + "epoch": 1.0089109326403125, + "grad_norm": 9.134366035461426, + "learning_rate": 3.3184817789328124e-05, + "loss": 2.8978, + "step": 3245500 + }, + { + "epoch": 1.0090663649207994, + "grad_norm": 7.535894870758057, + "learning_rate": 3.318222725132001e-05, + "loss": 2.8498, + "step": 3246000 + }, + { + "epoch": 1.0092217972012865, + "grad_norm": 13.323699951171875, + "learning_rate": 3.31796367133119e-05, + "loss": 2.8983, + "step": 3246500 + }, + { + "epoch": 1.0093772294817733, + "grad_norm": 19.50446891784668, + "learning_rate": 3.3177046175303785e-05, + "loss": 2.8962, + "step": 3247000 + }, + { + "epoch": 1.0095326617622602, + "grad_norm": 16.98702049255371, + "learning_rate": 3.3174455637295666e-05, + "loss": 2.8419, + "step": 3247500 + }, + { + "epoch": 1.009688094042747, + "grad_norm": 21.687198638916016, + "learning_rate": 3.317186509928755e-05, + "loss": 2.8809, + "step": 3248000 + }, + { + "epoch": 1.009843526323234, + "grad_norm": 7.772566795349121, + "learning_rate": 3.316927456127944e-05, + "loss": 2.8795, + "step": 3248500 + }, + { + "epoch": 1.0099989586037208, + "grad_norm": 11.223997116088867, + "learning_rate": 3.316668402327132e-05, + "loss": 2.8396, + "step": 3249000 + }, + { + "epoch": 1.0101543908842077, + "grad_norm": 7.6307806968688965, + "learning_rate": 3.316409348526321e-05, + "loss": 2.9116, + "step": 3249500 + }, + { + "epoch": 1.0103098231646945, + "grad_norm": 9.129250526428223, + "learning_rate": 3.316150294725509e-05, + "loss": 2.8601, + "step": 3250000 + }, + { + "epoch": 1.0104652554451814, + "grad_norm": 9.599295616149902, + "learning_rate": 3.3158912409246975e-05, + "loss": 2.8611, + "step": 3250500 + }, + { + "epoch": 1.0106206877256683, + "grad_norm": 8.483431816101074, + "learning_rate": 3.315632187123886e-05, + "loss": 2.854, + "step": 3251000 + }, + { + "epoch": 1.0107761200061551, + "grad_norm": 20.334983825683594, + "learning_rate": 3.315373133323075e-05, + "loss": 2.8608, + "step": 3251500 + }, + { + "epoch": 1.010931552286642, + "grad_norm": 10.229543685913086, + "learning_rate": 3.315114079522264e-05, + "loss": 2.8453, + "step": 3252000 + }, + { + "epoch": 1.0110869845671289, + "grad_norm": 9.783756256103516, + "learning_rate": 3.3148550257214524e-05, + "loss": 2.881, + "step": 3252500 + }, + { + "epoch": 1.0112424168476157, + "grad_norm": 9.429359436035156, + "learning_rate": 3.3145959719206404e-05, + "loss": 2.8165, + "step": 3253000 + }, + { + "epoch": 1.0113978491281026, + "grad_norm": 16.437156677246094, + "learning_rate": 3.314336918119829e-05, + "loss": 2.8473, + "step": 3253500 + }, + { + "epoch": 1.0115532814085895, + "grad_norm": 13.06186294555664, + "learning_rate": 3.314077864319018e-05, + "loss": 2.8882, + "step": 3254000 + }, + { + "epoch": 1.0117087136890763, + "grad_norm": 10.150651931762695, + "learning_rate": 3.313818810518206e-05, + "loss": 2.8698, + "step": 3254500 + }, + { + "epoch": 1.0118641459695632, + "grad_norm": 9.38198184967041, + "learning_rate": 3.3135597567173946e-05, + "loss": 2.8736, + "step": 3255000 + }, + { + "epoch": 1.01201957825005, + "grad_norm": 7.4972310066223145, + "learning_rate": 3.313300702916583e-05, + "loss": 2.874, + "step": 3255500 + }, + { + "epoch": 1.012175010530537, + "grad_norm": 10.082548141479492, + "learning_rate": 3.313041649115772e-05, + "loss": 2.9299, + "step": 3256000 + }, + { + "epoch": 1.0123304428110238, + "grad_norm": 9.289071083068848, + "learning_rate": 3.312782595314961e-05, + "loss": 2.8533, + "step": 3256500 + }, + { + "epoch": 1.0124858750915107, + "grad_norm": 8.774259567260742, + "learning_rate": 3.312523541514149e-05, + "loss": 2.863, + "step": 3257000 + }, + { + "epoch": 1.0126413073719975, + "grad_norm": 5.553544998168945, + "learning_rate": 3.3122644877133375e-05, + "loss": 2.8965, + "step": 3257500 + }, + { + "epoch": 1.0127967396524844, + "grad_norm": 15.58204174041748, + "learning_rate": 3.312005433912526e-05, + "loss": 2.8675, + "step": 3258000 + }, + { + "epoch": 1.0129521719329715, + "grad_norm": 7.641750812530518, + "learning_rate": 3.311746380111714e-05, + "loss": 2.8691, + "step": 3258500 + }, + { + "epoch": 1.0131076042134584, + "grad_norm": 7.584015846252441, + "learning_rate": 3.311487326310903e-05, + "loss": 2.8451, + "step": 3259000 + }, + { + "epoch": 1.0132630364939452, + "grad_norm": 8.82815933227539, + "learning_rate": 3.311228272510092e-05, + "loss": 2.8384, + "step": 3259500 + }, + { + "epoch": 1.013418468774432, + "grad_norm": 7.424738883972168, + "learning_rate": 3.31096921870928e-05, + "loss": 2.8907, + "step": 3260000 + }, + { + "epoch": 1.013573901054919, + "grad_norm": 15.421759605407715, + "learning_rate": 3.3107101649084684e-05, + "loss": 2.8888, + "step": 3260500 + }, + { + "epoch": 1.0137293333354058, + "grad_norm": 9.375105857849121, + "learning_rate": 3.310451111107657e-05, + "loss": 2.8459, + "step": 3261000 + }, + { + "epoch": 1.0138847656158927, + "grad_norm": 9.214042663574219, + "learning_rate": 3.310192057306846e-05, + "loss": 2.8428, + "step": 3261500 + }, + { + "epoch": 1.0140401978963796, + "grad_norm": 15.832090377807617, + "learning_rate": 3.3099330035060346e-05, + "loss": 2.8626, + "step": 3262000 + }, + { + "epoch": 1.0141956301768664, + "grad_norm": 12.878747940063477, + "learning_rate": 3.3096739497052226e-05, + "loss": 2.9001, + "step": 3262500 + }, + { + "epoch": 1.0143510624573533, + "grad_norm": 8.497390747070312, + "learning_rate": 3.309414895904411e-05, + "loss": 2.8635, + "step": 3263000 + }, + { + "epoch": 1.0145064947378402, + "grad_norm": 7.0437912940979, + "learning_rate": 3.3091558421036e-05, + "loss": 2.8273, + "step": 3263500 + }, + { + "epoch": 1.014661927018327, + "grad_norm": 84.82500457763672, + "learning_rate": 3.308896788302788e-05, + "loss": 2.8618, + "step": 3264000 + }, + { + "epoch": 1.014817359298814, + "grad_norm": 8.047569274902344, + "learning_rate": 3.308637734501977e-05, + "loss": 2.8653, + "step": 3264500 + }, + { + "epoch": 1.0149727915793008, + "grad_norm": 11.263128280639648, + "learning_rate": 3.3083786807011655e-05, + "loss": 2.8644, + "step": 3265000 + }, + { + "epoch": 1.0151282238597876, + "grad_norm": 6.924587726593018, + "learning_rate": 3.308119626900354e-05, + "loss": 2.8708, + "step": 3265500 + }, + { + "epoch": 1.0152836561402745, + "grad_norm": 8.368556022644043, + "learning_rate": 3.307860573099543e-05, + "loss": 2.8833, + "step": 3266000 + }, + { + "epoch": 1.0154390884207614, + "grad_norm": 9.474859237670898, + "learning_rate": 3.307601519298732e-05, + "loss": 2.8657, + "step": 3266500 + }, + { + "epoch": 1.0155945207012482, + "grad_norm": 8.33617877960205, + "learning_rate": 3.30734246549792e-05, + "loss": 2.9223, + "step": 3267000 + }, + { + "epoch": 1.015749952981735, + "grad_norm": 10.943012237548828, + "learning_rate": 3.3070834116971084e-05, + "loss": 2.8713, + "step": 3267500 + }, + { + "epoch": 1.015905385262222, + "grad_norm": 7.800692081451416, + "learning_rate": 3.3068243578962965e-05, + "loss": 2.8584, + "step": 3268000 + }, + { + "epoch": 1.0160608175427088, + "grad_norm": 7.792161464691162, + "learning_rate": 3.306565304095485e-05, + "loss": 2.8778, + "step": 3268500 + }, + { + "epoch": 1.0162162498231957, + "grad_norm": 9.458125114440918, + "learning_rate": 3.306306250294674e-05, + "loss": 2.8847, + "step": 3269000 + }, + { + "epoch": 1.0163716821036826, + "grad_norm": 9.381505966186523, + "learning_rate": 3.306047196493862e-05, + "loss": 2.8642, + "step": 3269500 + }, + { + "epoch": 1.0165271143841694, + "grad_norm": 6.20938777923584, + "learning_rate": 3.3057881426930506e-05, + "loss": 2.8559, + "step": 3270000 + }, + { + "epoch": 1.0166825466646565, + "grad_norm": 8.825209617614746, + "learning_rate": 3.3055290888922394e-05, + "loss": 2.9164, + "step": 3270500 + }, + { + "epoch": 1.0168379789451434, + "grad_norm": 9.404807090759277, + "learning_rate": 3.305270035091428e-05, + "loss": 2.8616, + "step": 3271000 + }, + { + "epoch": 1.0169934112256303, + "grad_norm": 7.542266368865967, + "learning_rate": 3.305010981290617e-05, + "loss": 2.858, + "step": 3271500 + }, + { + "epoch": 1.0171488435061171, + "grad_norm": 9.330745697021484, + "learning_rate": 3.3047519274898055e-05, + "loss": 2.8604, + "step": 3272000 + }, + { + "epoch": 1.017304275786604, + "grad_norm": 7.384363651275635, + "learning_rate": 3.3044928736889935e-05, + "loss": 2.8867, + "step": 3272500 + }, + { + "epoch": 1.0174597080670909, + "grad_norm": 7.732290744781494, + "learning_rate": 3.304233819888182e-05, + "loss": 2.9333, + "step": 3273000 + }, + { + "epoch": 1.0176151403475777, + "grad_norm": 10.974017143249512, + "learning_rate": 3.30397476608737e-05, + "loss": 2.8709, + "step": 3273500 + }, + { + "epoch": 1.0177705726280646, + "grad_norm": 10.508191108703613, + "learning_rate": 3.303715712286559e-05, + "loss": 2.8349, + "step": 3274000 + }, + { + "epoch": 1.0179260049085515, + "grad_norm": 27.66216468811035, + "learning_rate": 3.303456658485748e-05, + "loss": 2.8716, + "step": 3274500 + }, + { + "epoch": 1.0180814371890383, + "grad_norm": 9.453500747680664, + "learning_rate": 3.3031976046849364e-05, + "loss": 2.8695, + "step": 3275000 + }, + { + "epoch": 1.0182368694695252, + "grad_norm": 9.312958717346191, + "learning_rate": 3.302938550884125e-05, + "loss": 2.8646, + "step": 3275500 + }, + { + "epoch": 1.018392301750012, + "grad_norm": 23.38147735595703, + "learning_rate": 3.302679497083314e-05, + "loss": 2.8899, + "step": 3276000 + }, + { + "epoch": 1.018547734030499, + "grad_norm": 9.67337417602539, + "learning_rate": 3.302420443282502e-05, + "loss": 2.8834, + "step": 3276500 + }, + { + "epoch": 1.0187031663109858, + "grad_norm": 51.78237533569336, + "learning_rate": 3.3021613894816906e-05, + "loss": 2.8405, + "step": 3277000 + }, + { + "epoch": 1.0188585985914727, + "grad_norm": 14.427282333374023, + "learning_rate": 3.301902335680879e-05, + "loss": 2.8714, + "step": 3277500 + }, + { + "epoch": 1.0190140308719595, + "grad_norm": 8.916425704956055, + "learning_rate": 3.3016432818800674e-05, + "loss": 2.9044, + "step": 3278000 + }, + { + "epoch": 1.0191694631524464, + "grad_norm": 8.718879699707031, + "learning_rate": 3.301384228079256e-05, + "loss": 2.8017, + "step": 3278500 + }, + { + "epoch": 1.0193248954329333, + "grad_norm": 9.594842910766602, + "learning_rate": 3.301125174278445e-05, + "loss": 2.8839, + "step": 3279000 + }, + { + "epoch": 1.0194803277134201, + "grad_norm": 28.268537521362305, + "learning_rate": 3.300866120477633e-05, + "loss": 2.9054, + "step": 3279500 + }, + { + "epoch": 1.019635759993907, + "grad_norm": 9.431594848632812, + "learning_rate": 3.3006070666768216e-05, + "loss": 2.8734, + "step": 3280000 + }, + { + "epoch": 1.0197911922743939, + "grad_norm": 8.034910202026367, + "learning_rate": 3.30034801287601e-05, + "loss": 2.8788, + "step": 3280500 + }, + { + "epoch": 1.0199466245548807, + "grad_norm": 9.531851768493652, + "learning_rate": 3.300088959075199e-05, + "loss": 2.9109, + "step": 3281000 + }, + { + "epoch": 1.0201020568353676, + "grad_norm": 9.788823127746582, + "learning_rate": 3.299829905274388e-05, + "loss": 2.8532, + "step": 3281500 + }, + { + "epoch": 1.0202574891158545, + "grad_norm": 89.79019165039062, + "learning_rate": 3.299570851473576e-05, + "loss": 2.8613, + "step": 3282000 + }, + { + "epoch": 1.0204129213963415, + "grad_norm": 11.221163749694824, + "learning_rate": 3.2993117976727645e-05, + "loss": 2.8633, + "step": 3282500 + }, + { + "epoch": 1.0205683536768284, + "grad_norm": 8.07005786895752, + "learning_rate": 3.299052743871953e-05, + "loss": 2.8504, + "step": 3283000 + }, + { + "epoch": 1.0207237859573153, + "grad_norm": 9.15805721282959, + "learning_rate": 3.298793690071141e-05, + "loss": 2.8935, + "step": 3283500 + }, + { + "epoch": 1.0208792182378021, + "grad_norm": 9.98305606842041, + "learning_rate": 3.29853463627033e-05, + "loss": 2.8641, + "step": 3284000 + }, + { + "epoch": 1.021034650518289, + "grad_norm": 7.5928955078125, + "learning_rate": 3.2982755824695186e-05, + "loss": 2.8373, + "step": 3284500 + }, + { + "epoch": 1.0211900827987759, + "grad_norm": 11.066377639770508, + "learning_rate": 3.2980165286687074e-05, + "loss": 2.8528, + "step": 3285000 + }, + { + "epoch": 1.0213455150792627, + "grad_norm": 13.54810905456543, + "learning_rate": 3.297757474867896e-05, + "loss": 2.8814, + "step": 3285500 + }, + { + "epoch": 1.0215009473597496, + "grad_norm": 7.727709770202637, + "learning_rate": 3.297498421067084e-05, + "loss": 2.8751, + "step": 3286000 + }, + { + "epoch": 1.0216563796402365, + "grad_norm": 11.480743408203125, + "learning_rate": 3.297239367266273e-05, + "loss": 2.878, + "step": 3286500 + }, + { + "epoch": 1.0218118119207233, + "grad_norm": 8.23007583618164, + "learning_rate": 3.2969803134654615e-05, + "loss": 2.8565, + "step": 3287000 + }, + { + "epoch": 1.0219672442012102, + "grad_norm": 7.8557963371276855, + "learning_rate": 3.2967212596646496e-05, + "loss": 2.8786, + "step": 3287500 + }, + { + "epoch": 1.022122676481697, + "grad_norm": 9.84073543548584, + "learning_rate": 3.296462205863838e-05, + "loss": 2.849, + "step": 3288000 + }, + { + "epoch": 1.022278108762184, + "grad_norm": 7.330102920532227, + "learning_rate": 3.296203152063027e-05, + "loss": 2.8736, + "step": 3288500 + }, + { + "epoch": 1.0224335410426708, + "grad_norm": 10.211913108825684, + "learning_rate": 3.295944098262215e-05, + "loss": 2.8648, + "step": 3289000 + }, + { + "epoch": 1.0225889733231577, + "grad_norm": 7.422497749328613, + "learning_rate": 3.295685044461404e-05, + "loss": 2.8993, + "step": 3289500 + }, + { + "epoch": 1.0227444056036445, + "grad_norm": 9.681943893432617, + "learning_rate": 3.2954259906605925e-05, + "loss": 2.8488, + "step": 3290000 + }, + { + "epoch": 1.0228998378841314, + "grad_norm": 10.695550918579102, + "learning_rate": 3.295166936859781e-05, + "loss": 2.8279, + "step": 3290500 + }, + { + "epoch": 1.0230552701646183, + "grad_norm": 9.763370513916016, + "learning_rate": 3.29490788305897e-05, + "loss": 2.9268, + "step": 3291000 + }, + { + "epoch": 1.0232107024451051, + "grad_norm": 11.420149803161621, + "learning_rate": 3.294648829258158e-05, + "loss": 2.8857, + "step": 3291500 + }, + { + "epoch": 1.023366134725592, + "grad_norm": 7.8827104568481445, + "learning_rate": 3.294389775457347e-05, + "loss": 2.904, + "step": 3292000 + }, + { + "epoch": 1.0235215670060789, + "grad_norm": 8.196221351623535, + "learning_rate": 3.2941307216565354e-05, + "loss": 2.8847, + "step": 3292500 + }, + { + "epoch": 1.0236769992865657, + "grad_norm": 11.166979789733887, + "learning_rate": 3.2938716678557234e-05, + "loss": 2.873, + "step": 3293000 + }, + { + "epoch": 1.0238324315670526, + "grad_norm": 8.654476165771484, + "learning_rate": 3.293612614054912e-05, + "loss": 2.906, + "step": 3293500 + }, + { + "epoch": 1.0239878638475395, + "grad_norm": 7.556258678436279, + "learning_rate": 3.293353560254101e-05, + "loss": 2.8812, + "step": 3294000 + }, + { + "epoch": 1.0241432961280266, + "grad_norm": 12.857309341430664, + "learning_rate": 3.293094506453289e-05, + "loss": 2.8603, + "step": 3294500 + }, + { + "epoch": 1.0242987284085134, + "grad_norm": 7.802764892578125, + "learning_rate": 3.292835452652478e-05, + "loss": 2.851, + "step": 3295000 + }, + { + "epoch": 1.0244541606890003, + "grad_norm": 10.7689790725708, + "learning_rate": 3.292576398851667e-05, + "loss": 2.844, + "step": 3295500 + }, + { + "epoch": 1.0246095929694872, + "grad_norm": 12.153553009033203, + "learning_rate": 3.292317345050855e-05, + "loss": 2.8671, + "step": 3296000 + }, + { + "epoch": 1.024765025249974, + "grad_norm": 8.01387882232666, + "learning_rate": 3.292058291250044e-05, + "loss": 2.8355, + "step": 3296500 + }, + { + "epoch": 1.024920457530461, + "grad_norm": 10.451788902282715, + "learning_rate": 3.2917992374492325e-05, + "loss": 2.8451, + "step": 3297000 + }, + { + "epoch": 1.0250758898109478, + "grad_norm": 6.14119291305542, + "learning_rate": 3.2915401836484205e-05, + "loss": 2.8559, + "step": 3297500 + }, + { + "epoch": 1.0252313220914346, + "grad_norm": 13.489344596862793, + "learning_rate": 3.291281129847609e-05, + "loss": 2.8504, + "step": 3298000 + }, + { + "epoch": 1.0253867543719215, + "grad_norm": 9.789743423461914, + "learning_rate": 3.291022076046797e-05, + "loss": 2.8726, + "step": 3298500 + }, + { + "epoch": 1.0255421866524084, + "grad_norm": 9.845291137695312, + "learning_rate": 3.290763022245986e-05, + "loss": 2.897, + "step": 3299000 + }, + { + "epoch": 1.0256976189328952, + "grad_norm": 29.684967041015625, + "learning_rate": 3.290503968445175e-05, + "loss": 2.8906, + "step": 3299500 + }, + { + "epoch": 1.025853051213382, + "grad_norm": 10.191841125488281, + "learning_rate": 3.2902449146443634e-05, + "loss": 2.8523, + "step": 3300000 + }, + { + "epoch": 1.026008483493869, + "grad_norm": 13.463170051574707, + "learning_rate": 3.289985860843552e-05, + "loss": 2.8593, + "step": 3300500 + }, + { + "epoch": 1.0261639157743558, + "grad_norm": 12.682181358337402, + "learning_rate": 3.289726807042741e-05, + "loss": 2.9322, + "step": 3301000 + }, + { + "epoch": 1.0263193480548427, + "grad_norm": 7.942287445068359, + "learning_rate": 3.289467753241929e-05, + "loss": 2.8713, + "step": 3301500 + }, + { + "epoch": 1.0264747803353296, + "grad_norm": 38.37222671508789, + "learning_rate": 3.2892086994411176e-05, + "loss": 2.848, + "step": 3302000 + }, + { + "epoch": 1.0266302126158164, + "grad_norm": 8.21086311340332, + "learning_rate": 3.288949645640306e-05, + "loss": 2.8389, + "step": 3302500 + }, + { + "epoch": 1.0267856448963033, + "grad_norm": 10.336030960083008, + "learning_rate": 3.288690591839494e-05, + "loss": 2.8475, + "step": 3303000 + }, + { + "epoch": 1.0269410771767902, + "grad_norm": 6.431661605834961, + "learning_rate": 3.288431538038683e-05, + "loss": 2.8463, + "step": 3303500 + }, + { + "epoch": 1.027096509457277, + "grad_norm": 9.863065719604492, + "learning_rate": 3.288172484237871e-05, + "loss": 2.827, + "step": 3304000 + }, + { + "epoch": 1.027251941737764, + "grad_norm": 9.950100898742676, + "learning_rate": 3.28791343043706e-05, + "loss": 2.8935, + "step": 3304500 + }, + { + "epoch": 1.0274073740182508, + "grad_norm": 8.723226547241211, + "learning_rate": 3.287654376636249e-05, + "loss": 2.904, + "step": 3305000 + }, + { + "epoch": 1.0275628062987376, + "grad_norm": 11.591903686523438, + "learning_rate": 3.287395322835437e-05, + "loss": 2.8728, + "step": 3305500 + }, + { + "epoch": 1.0277182385792245, + "grad_norm": 10.323410034179688, + "learning_rate": 3.287136269034626e-05, + "loss": 2.9202, + "step": 3306000 + }, + { + "epoch": 1.0278736708597114, + "grad_norm": 10.830419540405273, + "learning_rate": 3.286877215233815e-05, + "loss": 2.8813, + "step": 3306500 + }, + { + "epoch": 1.0280291031401985, + "grad_norm": 9.02690601348877, + "learning_rate": 3.286618161433003e-05, + "loss": 2.8776, + "step": 3307000 + }, + { + "epoch": 1.0281845354206853, + "grad_norm": 8.598843574523926, + "learning_rate": 3.2863591076321914e-05, + "loss": 2.87, + "step": 3307500 + }, + { + "epoch": 1.0283399677011722, + "grad_norm": 7.965668201446533, + "learning_rate": 3.28610005383138e-05, + "loss": 2.9085, + "step": 3308000 + }, + { + "epoch": 1.028495399981659, + "grad_norm": 25.186216354370117, + "learning_rate": 3.285841000030568e-05, + "loss": 2.9163, + "step": 3308500 + }, + { + "epoch": 1.028650832262146, + "grad_norm": 7.628626346588135, + "learning_rate": 3.285581946229757e-05, + "loss": 2.891, + "step": 3309000 + }, + { + "epoch": 1.0288062645426328, + "grad_norm": 6.812699794769287, + "learning_rate": 3.2853228924289456e-05, + "loss": 2.8922, + "step": 3309500 + }, + { + "epoch": 1.0289616968231197, + "grad_norm": 8.224309921264648, + "learning_rate": 3.285063838628134e-05, + "loss": 2.895, + "step": 3310000 + }, + { + "epoch": 1.0291171291036065, + "grad_norm": 9.926901817321777, + "learning_rate": 3.284804784827323e-05, + "loss": 2.875, + "step": 3310500 + }, + { + "epoch": 1.0292725613840934, + "grad_norm": 10.478351593017578, + "learning_rate": 3.284545731026511e-05, + "loss": 2.8494, + "step": 3311000 + }, + { + "epoch": 1.0294279936645803, + "grad_norm": 9.773934364318848, + "learning_rate": 3.2842866772257e-05, + "loss": 2.866, + "step": 3311500 + }, + { + "epoch": 1.0295834259450671, + "grad_norm": 19.20359992980957, + "learning_rate": 3.2840276234248885e-05, + "loss": 2.8065, + "step": 3312000 + }, + { + "epoch": 1.029738858225554, + "grad_norm": 7.2200727462768555, + "learning_rate": 3.2837685696240765e-05, + "loss": 2.8633, + "step": 3312500 + }, + { + "epoch": 1.0298942905060409, + "grad_norm": 7.069849491119385, + "learning_rate": 3.283509515823265e-05, + "loss": 2.8609, + "step": 3313000 + }, + { + "epoch": 1.0300497227865277, + "grad_norm": 19.162700653076172, + "learning_rate": 3.283250462022454e-05, + "loss": 2.8608, + "step": 3313500 + }, + { + "epoch": 1.0302051550670146, + "grad_norm": 8.849251747131348, + "learning_rate": 3.282991408221642e-05, + "loss": 2.8801, + "step": 3314000 + }, + { + "epoch": 1.0303605873475015, + "grad_norm": 10.76231575012207, + "learning_rate": 3.282732354420831e-05, + "loss": 2.8822, + "step": 3314500 + }, + { + "epoch": 1.0305160196279883, + "grad_norm": 11.447799682617188, + "learning_rate": 3.28247330062002e-05, + "loss": 2.821, + "step": 3315000 + }, + { + "epoch": 1.0306714519084752, + "grad_norm": 8.936769485473633, + "learning_rate": 3.282214246819208e-05, + "loss": 2.8785, + "step": 3315500 + }, + { + "epoch": 1.030826884188962, + "grad_norm": 7.538845062255859, + "learning_rate": 3.281955193018397e-05, + "loss": 2.8765, + "step": 3316000 + }, + { + "epoch": 1.030982316469449, + "grad_norm": 10.097002983093262, + "learning_rate": 3.281696139217585e-05, + "loss": 2.8327, + "step": 3316500 + }, + { + "epoch": 1.0311377487499358, + "grad_norm": 9.029407501220703, + "learning_rate": 3.2814370854167736e-05, + "loss": 2.8099, + "step": 3317000 + }, + { + "epoch": 1.0312931810304227, + "grad_norm": 7.743147850036621, + "learning_rate": 3.281178031615962e-05, + "loss": 2.8508, + "step": 3317500 + }, + { + "epoch": 1.0314486133109095, + "grad_norm": 9.6588716506958, + "learning_rate": 3.2809189778151504e-05, + "loss": 2.8528, + "step": 3318000 + }, + { + "epoch": 1.0316040455913966, + "grad_norm": 7.683608531951904, + "learning_rate": 3.280659924014339e-05, + "loss": 2.8413, + "step": 3318500 + }, + { + "epoch": 1.0317594778718835, + "grad_norm": 12.298110008239746, + "learning_rate": 3.280400870213528e-05, + "loss": 2.8678, + "step": 3319000 + }, + { + "epoch": 1.0319149101523704, + "grad_norm": 9.473699569702148, + "learning_rate": 3.2801418164127165e-05, + "loss": 2.8645, + "step": 3319500 + }, + { + "epoch": 1.0320703424328572, + "grad_norm": 8.74713134765625, + "learning_rate": 3.279882762611905e-05, + "loss": 2.9171, + "step": 3320000 + }, + { + "epoch": 1.032225774713344, + "grad_norm": 8.20040512084961, + "learning_rate": 3.279623708811094e-05, + "loss": 2.8454, + "step": 3320500 + }, + { + "epoch": 1.032381206993831, + "grad_norm": 22.003536224365234, + "learning_rate": 3.279364655010282e-05, + "loss": 2.8485, + "step": 3321000 + }, + { + "epoch": 1.0325366392743178, + "grad_norm": 8.69116497039795, + "learning_rate": 3.279105601209471e-05, + "loss": 2.8596, + "step": 3321500 + }, + { + "epoch": 1.0326920715548047, + "grad_norm": 11.200221061706543, + "learning_rate": 3.278846547408659e-05, + "loss": 2.8358, + "step": 3322000 + }, + { + "epoch": 1.0328475038352916, + "grad_norm": 8.605266571044922, + "learning_rate": 3.2785874936078475e-05, + "loss": 2.8062, + "step": 3322500 + }, + { + "epoch": 1.0330029361157784, + "grad_norm": 8.872060775756836, + "learning_rate": 3.278328439807036e-05, + "loss": 2.8723, + "step": 3323000 + }, + { + "epoch": 1.0331583683962653, + "grad_norm": 10.810295104980469, + "learning_rate": 3.278069386006224e-05, + "loss": 2.8953, + "step": 3323500 + }, + { + "epoch": 1.0333138006767522, + "grad_norm": 6.672851085662842, + "learning_rate": 3.277810332205413e-05, + "loss": 2.8752, + "step": 3324000 + }, + { + "epoch": 1.033469232957239, + "grad_norm": 8.25992202758789, + "learning_rate": 3.2775512784046016e-05, + "loss": 2.8624, + "step": 3324500 + }, + { + "epoch": 1.033624665237726, + "grad_norm": 20.113361358642578, + "learning_rate": 3.2772922246037904e-05, + "loss": 2.8556, + "step": 3325000 + }, + { + "epoch": 1.0337800975182128, + "grad_norm": 9.070086479187012, + "learning_rate": 3.277033170802979e-05, + "loss": 2.8597, + "step": 3325500 + }, + { + "epoch": 1.0339355297986996, + "grad_norm": 25.651424407958984, + "learning_rate": 3.276774117002168e-05, + "loss": 2.8587, + "step": 3326000 + }, + { + "epoch": 1.0340909620791865, + "grad_norm": 8.875636100769043, + "learning_rate": 3.276515063201356e-05, + "loss": 2.8809, + "step": 3326500 + }, + { + "epoch": 1.0342463943596734, + "grad_norm": 8.102267265319824, + "learning_rate": 3.2762560094005445e-05, + "loss": 2.837, + "step": 3327000 + }, + { + "epoch": 1.0344018266401602, + "grad_norm": 9.199847221374512, + "learning_rate": 3.275996955599733e-05, + "loss": 2.8326, + "step": 3327500 + }, + { + "epoch": 1.034557258920647, + "grad_norm": 10.6712064743042, + "learning_rate": 3.275737901798921e-05, + "loss": 2.8361, + "step": 3328000 + }, + { + "epoch": 1.034712691201134, + "grad_norm": 9.579446792602539, + "learning_rate": 3.27547884799811e-05, + "loss": 2.8612, + "step": 3328500 + }, + { + "epoch": 1.0348681234816208, + "grad_norm": 17.089048385620117, + "learning_rate": 3.275219794197299e-05, + "loss": 2.8835, + "step": 3329000 + }, + { + "epoch": 1.0350235557621077, + "grad_norm": 9.896514892578125, + "learning_rate": 3.2749607403964874e-05, + "loss": 2.8754, + "step": 3329500 + }, + { + "epoch": 1.0351789880425946, + "grad_norm": 8.126344680786133, + "learning_rate": 3.274701686595676e-05, + "loss": 2.8726, + "step": 3330000 + }, + { + "epoch": 1.0353344203230814, + "grad_norm": 6.562243938446045, + "learning_rate": 3.274442632794864e-05, + "loss": 2.8305, + "step": 3330500 + }, + { + "epoch": 1.0354898526035685, + "grad_norm": 8.842888832092285, + "learning_rate": 3.274183578994053e-05, + "loss": 2.8487, + "step": 3331000 + }, + { + "epoch": 1.0356452848840554, + "grad_norm": 12.274758338928223, + "learning_rate": 3.2739245251932416e-05, + "loss": 2.8546, + "step": 3331500 + }, + { + "epoch": 1.0358007171645423, + "grad_norm": 9.413209915161133, + "learning_rate": 3.2736654713924297e-05, + "loss": 2.8835, + "step": 3332000 + }, + { + "epoch": 1.0359561494450291, + "grad_norm": 7.511793613433838, + "learning_rate": 3.2734064175916184e-05, + "loss": 2.8708, + "step": 3332500 + }, + { + "epoch": 1.036111581725516, + "grad_norm": 8.964974403381348, + "learning_rate": 3.273147363790807e-05, + "loss": 2.8533, + "step": 3333000 + }, + { + "epoch": 1.0362670140060029, + "grad_norm": 8.719696998596191, + "learning_rate": 3.272888309989995e-05, + "loss": 2.9223, + "step": 3333500 + }, + { + "epoch": 1.0364224462864897, + "grad_norm": 9.011259078979492, + "learning_rate": 3.272629256189184e-05, + "loss": 2.8435, + "step": 3334000 + }, + { + "epoch": 1.0365778785669766, + "grad_norm": 7.6226325035095215, + "learning_rate": 3.2723702023883726e-05, + "loss": 2.8699, + "step": 3334500 + }, + { + "epoch": 1.0367333108474635, + "grad_norm": 10.449501037597656, + "learning_rate": 3.272111148587561e-05, + "loss": 2.8918, + "step": 3335000 + }, + { + "epoch": 1.0368887431279503, + "grad_norm": 8.72227954864502, + "learning_rate": 3.27185209478675e-05, + "loss": 2.8554, + "step": 3335500 + }, + { + "epoch": 1.0370441754084372, + "grad_norm": 7.4040398597717285, + "learning_rate": 3.271593040985938e-05, + "loss": 2.8095, + "step": 3336000 + }, + { + "epoch": 1.037199607688924, + "grad_norm": 9.740185737609863, + "learning_rate": 3.271333987185127e-05, + "loss": 2.8535, + "step": 3336500 + }, + { + "epoch": 1.037355039969411, + "grad_norm": 8.732361793518066, + "learning_rate": 3.2710749333843155e-05, + "loss": 2.8776, + "step": 3337000 + }, + { + "epoch": 1.0375104722498978, + "grad_norm": 7.291806697845459, + "learning_rate": 3.2708158795835035e-05, + "loss": 2.9041, + "step": 3337500 + }, + { + "epoch": 1.0376659045303847, + "grad_norm": 11.416768074035645, + "learning_rate": 3.270556825782692e-05, + "loss": 2.8661, + "step": 3338000 + }, + { + "epoch": 1.0378213368108715, + "grad_norm": 6.433290481567383, + "learning_rate": 3.270297771981881e-05, + "loss": 2.8871, + "step": 3338500 + }, + { + "epoch": 1.0379767690913584, + "grad_norm": 8.690683364868164, + "learning_rate": 3.2700387181810696e-05, + "loss": 2.8558, + "step": 3339000 + }, + { + "epoch": 1.0381322013718453, + "grad_norm": 6.796769618988037, + "learning_rate": 3.2697796643802584e-05, + "loss": 2.8735, + "step": 3339500 + }, + { + "epoch": 1.0382876336523321, + "grad_norm": 9.006214141845703, + "learning_rate": 3.2695206105794464e-05, + "loss": 2.8581, + "step": 3340000 + }, + { + "epoch": 1.038443065932819, + "grad_norm": 12.732399940490723, + "learning_rate": 3.269261556778635e-05, + "loss": 2.8436, + "step": 3340500 + }, + { + "epoch": 1.0385984982133059, + "grad_norm": 8.448224067687988, + "learning_rate": 3.269002502977824e-05, + "loss": 2.8479, + "step": 3341000 + }, + { + "epoch": 1.0387539304937927, + "grad_norm": 8.728836059570312, + "learning_rate": 3.268743449177012e-05, + "loss": 2.8126, + "step": 3341500 + }, + { + "epoch": 1.0389093627742796, + "grad_norm": 10.031523704528809, + "learning_rate": 3.2684843953762006e-05, + "loss": 2.8429, + "step": 3342000 + }, + { + "epoch": 1.0390647950547667, + "grad_norm": 13.377091407775879, + "learning_rate": 3.268225341575389e-05, + "loss": 2.8712, + "step": 3342500 + }, + { + "epoch": 1.0392202273352535, + "grad_norm": 9.674070358276367, + "learning_rate": 3.267966287774577e-05, + "loss": 2.8747, + "step": 3343000 + }, + { + "epoch": 1.0393756596157404, + "grad_norm": 19.416210174560547, + "learning_rate": 3.267707233973766e-05, + "loss": 2.9081, + "step": 3343500 + }, + { + "epoch": 1.0395310918962273, + "grad_norm": 7.10913610458374, + "learning_rate": 3.267448180172955e-05, + "loss": 2.8184, + "step": 3344000 + }, + { + "epoch": 1.0396865241767141, + "grad_norm": 7.835684299468994, + "learning_rate": 3.2671891263721435e-05, + "loss": 2.8464, + "step": 3344500 + }, + { + "epoch": 1.039841956457201, + "grad_norm": 11.673162460327148, + "learning_rate": 3.266930072571332e-05, + "loss": 2.8911, + "step": 3345000 + }, + { + "epoch": 1.0399973887376879, + "grad_norm": 7.756838321685791, + "learning_rate": 3.266671018770521e-05, + "loss": 2.8615, + "step": 3345500 + }, + { + "epoch": 1.0401528210181747, + "grad_norm": 6.719972610473633, + "learning_rate": 3.266411964969709e-05, + "loss": 2.8742, + "step": 3346000 + }, + { + "epoch": 1.0403082532986616, + "grad_norm": 7.726101875305176, + "learning_rate": 3.2661529111688977e-05, + "loss": 2.8922, + "step": 3346500 + }, + { + "epoch": 1.0404636855791485, + "grad_norm": 11.285252571105957, + "learning_rate": 3.265893857368086e-05, + "loss": 2.8855, + "step": 3347000 + }, + { + "epoch": 1.0406191178596353, + "grad_norm": 14.48184871673584, + "learning_rate": 3.2656348035672744e-05, + "loss": 2.886, + "step": 3347500 + }, + { + "epoch": 1.0407745501401222, + "grad_norm": 10.784356117248535, + "learning_rate": 3.265375749766463e-05, + "loss": 2.8524, + "step": 3348000 + }, + { + "epoch": 1.040929982420609, + "grad_norm": 10.681642532348633, + "learning_rate": 3.265116695965652e-05, + "loss": 2.8743, + "step": 3348500 + }, + { + "epoch": 1.041085414701096, + "grad_norm": 8.037257194519043, + "learning_rate": 3.2648576421648406e-05, + "loss": 2.8625, + "step": 3349000 + }, + { + "epoch": 1.0412408469815828, + "grad_norm": 8.317471504211426, + "learning_rate": 3.264598588364029e-05, + "loss": 2.8843, + "step": 3349500 + }, + { + "epoch": 1.0413962792620697, + "grad_norm": 15.394457817077637, + "learning_rate": 3.264339534563217e-05, + "loss": 2.8474, + "step": 3350000 + }, + { + "epoch": 1.0415517115425565, + "grad_norm": 8.801002502441406, + "learning_rate": 3.264080480762406e-05, + "loss": 2.8843, + "step": 3350500 + }, + { + "epoch": 1.0417071438230434, + "grad_norm": 9.660317420959473, + "learning_rate": 3.263821426961595e-05, + "loss": 2.8872, + "step": 3351000 + }, + { + "epoch": 1.0418625761035303, + "grad_norm": 7.780256748199463, + "learning_rate": 3.263562373160783e-05, + "loss": 2.8838, + "step": 3351500 + }, + { + "epoch": 1.0420180083840171, + "grad_norm": 11.353142738342285, + "learning_rate": 3.2633033193599715e-05, + "loss": 2.8621, + "step": 3352000 + }, + { + "epoch": 1.042173440664504, + "grad_norm": 6.623621940612793, + "learning_rate": 3.2630442655591595e-05, + "loss": 2.865, + "step": 3352500 + }, + { + "epoch": 1.0423288729449909, + "grad_norm": 12.190373420715332, + "learning_rate": 3.262785211758348e-05, + "loss": 2.8724, + "step": 3353000 + }, + { + "epoch": 1.0424843052254777, + "grad_norm": 14.391398429870605, + "learning_rate": 3.262526157957537e-05, + "loss": 2.8727, + "step": 3353500 + }, + { + "epoch": 1.0426397375059646, + "grad_norm": 15.982589721679688, + "learning_rate": 3.262267104156726e-05, + "loss": 2.8509, + "step": 3354000 + }, + { + "epoch": 1.0427951697864515, + "grad_norm": 7.943883419036865, + "learning_rate": 3.2620080503559144e-05, + "loss": 2.8503, + "step": 3354500 + }, + { + "epoch": 1.0429506020669386, + "grad_norm": 9.610298156738281, + "learning_rate": 3.261748996555103e-05, + "loss": 2.8929, + "step": 3355000 + }, + { + "epoch": 1.0431060343474254, + "grad_norm": 8.762368202209473, + "learning_rate": 3.261489942754291e-05, + "loss": 2.8803, + "step": 3355500 + }, + { + "epoch": 1.0432614666279123, + "grad_norm": 8.665312767028809, + "learning_rate": 3.26123088895348e-05, + "loss": 2.8785, + "step": 3356000 + }, + { + "epoch": 1.0434168989083992, + "grad_norm": 9.044523239135742, + "learning_rate": 3.2609718351526686e-05, + "loss": 2.8705, + "step": 3356500 + }, + { + "epoch": 1.043572331188886, + "grad_norm": 35.056766510009766, + "learning_rate": 3.2607127813518566e-05, + "loss": 2.8482, + "step": 3357000 + }, + { + "epoch": 1.043727763469373, + "grad_norm": 11.186482429504395, + "learning_rate": 3.260453727551045e-05, + "loss": 2.8737, + "step": 3357500 + }, + { + "epoch": 1.0438831957498598, + "grad_norm": 15.18598747253418, + "learning_rate": 3.2601946737502334e-05, + "loss": 2.9003, + "step": 3358000 + }, + { + "epoch": 1.0440386280303466, + "grad_norm": 10.275506973266602, + "learning_rate": 3.259935619949423e-05, + "loss": 2.8606, + "step": 3358500 + }, + { + "epoch": 1.0441940603108335, + "grad_norm": 9.602059364318848, + "learning_rate": 3.2596765661486115e-05, + "loss": 2.9561, + "step": 3359000 + }, + { + "epoch": 1.0443494925913204, + "grad_norm": 10.498069763183594, + "learning_rate": 3.2594175123477995e-05, + "loss": 2.7977, + "step": 3359500 + }, + { + "epoch": 1.0445049248718072, + "grad_norm": 8.530423164367676, + "learning_rate": 3.259158458546988e-05, + "loss": 2.839, + "step": 3360000 + }, + { + "epoch": 1.044660357152294, + "grad_norm": 12.033307075500488, + "learning_rate": 3.258899404746177e-05, + "loss": 2.848, + "step": 3360500 + }, + { + "epoch": 1.044815789432781, + "grad_norm": 7.641107082366943, + "learning_rate": 3.258640350945365e-05, + "loss": 2.8881, + "step": 3361000 + }, + { + "epoch": 1.0449712217132678, + "grad_norm": 10.035137176513672, + "learning_rate": 3.258381297144554e-05, + "loss": 2.8184, + "step": 3361500 + }, + { + "epoch": 1.0451266539937547, + "grad_norm": 5.767294406890869, + "learning_rate": 3.2581222433437424e-05, + "loss": 2.8653, + "step": 3362000 + }, + { + "epoch": 1.0452820862742416, + "grad_norm": 8.684714317321777, + "learning_rate": 3.2578631895429305e-05, + "loss": 2.8951, + "step": 3362500 + }, + { + "epoch": 1.0454375185547284, + "grad_norm": 9.045610427856445, + "learning_rate": 3.257604135742119e-05, + "loss": 2.8666, + "step": 3363000 + }, + { + "epoch": 1.0455929508352153, + "grad_norm": 9.395450592041016, + "learning_rate": 3.257345081941308e-05, + "loss": 2.8418, + "step": 3363500 + }, + { + "epoch": 1.0457483831157022, + "grad_norm": 9.529364585876465, + "learning_rate": 3.2570860281404966e-05, + "loss": 2.8593, + "step": 3364000 + }, + { + "epoch": 1.045903815396189, + "grad_norm": 7.917593002319336, + "learning_rate": 3.256826974339685e-05, + "loss": 2.8302, + "step": 3364500 + }, + { + "epoch": 1.046059247676676, + "grad_norm": 10.421682357788086, + "learning_rate": 3.2565679205388734e-05, + "loss": 2.8599, + "step": 3365000 + }, + { + "epoch": 1.0462146799571628, + "grad_norm": 8.544768333435059, + "learning_rate": 3.256308866738062e-05, + "loss": 2.9205, + "step": 3365500 + }, + { + "epoch": 1.0463701122376496, + "grad_norm": 12.750513076782227, + "learning_rate": 3.256049812937251e-05, + "loss": 2.8221, + "step": 3366000 + }, + { + "epoch": 1.0465255445181367, + "grad_norm": 7.592057704925537, + "learning_rate": 3.255790759136439e-05, + "loss": 2.857, + "step": 3366500 + }, + { + "epoch": 1.0466809767986236, + "grad_norm": 15.359182357788086, + "learning_rate": 3.2555317053356275e-05, + "loss": 2.862, + "step": 3367000 + }, + { + "epoch": 1.0468364090791105, + "grad_norm": 10.464513778686523, + "learning_rate": 3.255272651534816e-05, + "loss": 2.8645, + "step": 3367500 + }, + { + "epoch": 1.0469918413595973, + "grad_norm": 7.326046466827393, + "learning_rate": 3.255013597734004e-05, + "loss": 2.8664, + "step": 3368000 + }, + { + "epoch": 1.0471472736400842, + "grad_norm": 9.262809753417969, + "learning_rate": 3.254754543933194e-05, + "loss": 2.8473, + "step": 3368500 + }, + { + "epoch": 1.047302705920571, + "grad_norm": 18.913654327392578, + "learning_rate": 3.2544954901323824e-05, + "loss": 2.8644, + "step": 3369000 + }, + { + "epoch": 1.047458138201058, + "grad_norm": 9.129252433776855, + "learning_rate": 3.2542364363315704e-05, + "loss": 2.879, + "step": 3369500 + }, + { + "epoch": 1.0476135704815448, + "grad_norm": 10.872483253479004, + "learning_rate": 3.253977382530759e-05, + "loss": 2.865, + "step": 3370000 + }, + { + "epoch": 1.0477690027620317, + "grad_norm": 11.45192813873291, + "learning_rate": 3.253718328729947e-05, + "loss": 2.9286, + "step": 3370500 + }, + { + "epoch": 1.0479244350425185, + "grad_norm": 9.753629684448242, + "learning_rate": 3.253459274929136e-05, + "loss": 2.8605, + "step": 3371000 + }, + { + "epoch": 1.0480798673230054, + "grad_norm": 8.655069351196289, + "learning_rate": 3.2532002211283246e-05, + "loss": 2.8461, + "step": 3371500 + }, + { + "epoch": 1.0482352996034923, + "grad_norm": 8.777994155883789, + "learning_rate": 3.2529411673275127e-05, + "loss": 2.8834, + "step": 3372000 + }, + { + "epoch": 1.0483907318839791, + "grad_norm": 8.770390510559082, + "learning_rate": 3.2526821135267014e-05, + "loss": 2.8974, + "step": 3372500 + }, + { + "epoch": 1.048546164164466, + "grad_norm": 11.187015533447266, + "learning_rate": 3.25242305972589e-05, + "loss": 2.8428, + "step": 3373000 + }, + { + "epoch": 1.0487015964449529, + "grad_norm": 9.808154106140137, + "learning_rate": 3.252164005925079e-05, + "loss": 2.8878, + "step": 3373500 + }, + { + "epoch": 1.0488570287254397, + "grad_norm": 8.049337387084961, + "learning_rate": 3.2519049521242675e-05, + "loss": 2.8909, + "step": 3374000 + }, + { + "epoch": 1.0490124610059266, + "grad_norm": 9.071319580078125, + "learning_rate": 3.251645898323456e-05, + "loss": 2.8373, + "step": 3374500 + }, + { + "epoch": 1.0491678932864135, + "grad_norm": 7.850836277008057, + "learning_rate": 3.251386844522644e-05, + "loss": 2.8391, + "step": 3375000 + }, + { + "epoch": 1.0493233255669003, + "grad_norm": 7.119524002075195, + "learning_rate": 3.251127790721833e-05, + "loss": 2.8553, + "step": 3375500 + }, + { + "epoch": 1.0494787578473872, + "grad_norm": 5.977488040924072, + "learning_rate": 3.250868736921021e-05, + "loss": 2.8348, + "step": 3376000 + }, + { + "epoch": 1.049634190127874, + "grad_norm": 6.1781229972839355, + "learning_rate": 3.25060968312021e-05, + "loss": 2.8657, + "step": 3376500 + }, + { + "epoch": 1.049789622408361, + "grad_norm": 9.584866523742676, + "learning_rate": 3.2503506293193985e-05, + "loss": 2.8483, + "step": 3377000 + }, + { + "epoch": 1.0499450546888478, + "grad_norm": 13.02830982208252, + "learning_rate": 3.2500915755185865e-05, + "loss": 2.8565, + "step": 3377500 + }, + { + "epoch": 1.0501004869693347, + "grad_norm": 8.913670539855957, + "learning_rate": 3.249832521717775e-05, + "loss": 2.8711, + "step": 3378000 + }, + { + "epoch": 1.0502559192498215, + "grad_norm": 13.018719673156738, + "learning_rate": 3.2495734679169646e-05, + "loss": 2.8391, + "step": 3378500 + }, + { + "epoch": 1.0504113515303086, + "grad_norm": 9.217266082763672, + "learning_rate": 3.2493144141161526e-05, + "loss": 2.8916, + "step": 3379000 + }, + { + "epoch": 1.0505667838107955, + "grad_norm": 11.71316909790039, + "learning_rate": 3.2490553603153414e-05, + "loss": 2.8564, + "step": 3379500 + }, + { + "epoch": 1.0507222160912824, + "grad_norm": 10.367610931396484, + "learning_rate": 3.24879630651453e-05, + "loss": 2.8601, + "step": 3380000 + }, + { + "epoch": 1.0508776483717692, + "grad_norm": 7.946877479553223, + "learning_rate": 3.248537252713718e-05, + "loss": 2.8796, + "step": 3380500 + }, + { + "epoch": 1.051033080652256, + "grad_norm": 11.3272123336792, + "learning_rate": 3.248278198912907e-05, + "loss": 2.8738, + "step": 3381000 + }, + { + "epoch": 1.051188512932743, + "grad_norm": 8.713533401489258, + "learning_rate": 3.2480191451120955e-05, + "loss": 2.8442, + "step": 3381500 + }, + { + "epoch": 1.0513439452132298, + "grad_norm": 8.333279609680176, + "learning_rate": 3.2477600913112836e-05, + "loss": 2.8773, + "step": 3382000 + }, + { + "epoch": 1.0514993774937167, + "grad_norm": 8.522666931152344, + "learning_rate": 3.247501037510472e-05, + "loss": 2.8944, + "step": 3382500 + }, + { + "epoch": 1.0516548097742036, + "grad_norm": 8.776286125183105, + "learning_rate": 3.247241983709661e-05, + "loss": 2.8752, + "step": 3383000 + }, + { + "epoch": 1.0518102420546904, + "grad_norm": 10.237665176391602, + "learning_rate": 3.24698292990885e-05, + "loss": 2.8622, + "step": 3383500 + }, + { + "epoch": 1.0519656743351773, + "grad_norm": 11.177488327026367, + "learning_rate": 3.2467238761080384e-05, + "loss": 2.8846, + "step": 3384000 + }, + { + "epoch": 1.0521211066156642, + "grad_norm": 10.0586519241333, + "learning_rate": 3.2464648223072265e-05, + "loss": 2.881, + "step": 3384500 + }, + { + "epoch": 1.052276538896151, + "grad_norm": 9.6727933883667, + "learning_rate": 3.246205768506415e-05, + "loss": 2.8379, + "step": 3385000 + }, + { + "epoch": 1.052431971176638, + "grad_norm": 12.004815101623535, + "learning_rate": 3.245946714705604e-05, + "loss": 2.8351, + "step": 3385500 + }, + { + "epoch": 1.0525874034571248, + "grad_norm": 13.208479881286621, + "learning_rate": 3.245687660904792e-05, + "loss": 2.8503, + "step": 3386000 + }, + { + "epoch": 1.0527428357376116, + "grad_norm": 9.809303283691406, + "learning_rate": 3.2454286071039807e-05, + "loss": 2.8634, + "step": 3386500 + }, + { + "epoch": 1.0528982680180985, + "grad_norm": 7.496593952178955, + "learning_rate": 3.2451695533031694e-05, + "loss": 2.7986, + "step": 3387000 + }, + { + "epoch": 1.0530537002985854, + "grad_norm": 10.199553489685059, + "learning_rate": 3.2449104995023574e-05, + "loss": 2.8759, + "step": 3387500 + }, + { + "epoch": 1.0532091325790722, + "grad_norm": 7.594255447387695, + "learning_rate": 3.244651445701546e-05, + "loss": 2.882, + "step": 3388000 + }, + { + "epoch": 1.053364564859559, + "grad_norm": 8.193445205688477, + "learning_rate": 3.244392391900735e-05, + "loss": 2.9098, + "step": 3388500 + }, + { + "epoch": 1.053519997140046, + "grad_norm": 9.093660354614258, + "learning_rate": 3.2441333380999236e-05, + "loss": 2.8325, + "step": 3389000 + }, + { + "epoch": 1.0536754294205328, + "grad_norm": 9.427735328674316, + "learning_rate": 3.243874284299112e-05, + "loss": 2.8431, + "step": 3389500 + }, + { + "epoch": 1.0538308617010197, + "grad_norm": 10.882515907287598, + "learning_rate": 3.2436152304983e-05, + "loss": 2.8321, + "step": 3390000 + }, + { + "epoch": 1.0539862939815066, + "grad_norm": 7.571011066436768, + "learning_rate": 3.243356176697489e-05, + "loss": 2.8551, + "step": 3390500 + }, + { + "epoch": 1.0541417262619937, + "grad_norm": 8.213763236999512, + "learning_rate": 3.243097122896678e-05, + "loss": 2.9043, + "step": 3391000 + }, + { + "epoch": 1.0542971585424805, + "grad_norm": 8.812474250793457, + "learning_rate": 3.242838069095866e-05, + "loss": 2.8377, + "step": 3391500 + }, + { + "epoch": 1.0544525908229674, + "grad_norm": 9.856983184814453, + "learning_rate": 3.2425790152950545e-05, + "loss": 2.8026, + "step": 3392000 + }, + { + "epoch": 1.0546080231034543, + "grad_norm": 10.415207862854004, + "learning_rate": 3.242319961494243e-05, + "loss": 2.8625, + "step": 3392500 + }, + { + "epoch": 1.0547634553839411, + "grad_norm": 9.277179718017578, + "learning_rate": 3.242060907693432e-05, + "loss": 2.8926, + "step": 3393000 + }, + { + "epoch": 1.054918887664428, + "grad_norm": 8.654946327209473, + "learning_rate": 3.2418018538926206e-05, + "loss": 2.8814, + "step": 3393500 + }, + { + "epoch": 1.0550743199449149, + "grad_norm": 9.249979019165039, + "learning_rate": 3.241542800091809e-05, + "loss": 2.8587, + "step": 3394000 + }, + { + "epoch": 1.0552297522254017, + "grad_norm": 7.358203411102295, + "learning_rate": 3.2412837462909974e-05, + "loss": 2.857, + "step": 3394500 + }, + { + "epoch": 1.0553851845058886, + "grad_norm": 7.291035175323486, + "learning_rate": 3.241024692490186e-05, + "loss": 2.8531, + "step": 3395000 + }, + { + "epoch": 1.0555406167863755, + "grad_norm": 9.691112518310547, + "learning_rate": 3.240765638689374e-05, + "loss": 2.8697, + "step": 3395500 + }, + { + "epoch": 1.0556960490668623, + "grad_norm": 9.109711647033691, + "learning_rate": 3.240506584888563e-05, + "loss": 2.8533, + "step": 3396000 + }, + { + "epoch": 1.0558514813473492, + "grad_norm": 8.043835639953613, + "learning_rate": 3.2402475310877516e-05, + "loss": 2.8977, + "step": 3396500 + }, + { + "epoch": 1.056006913627836, + "grad_norm": 11.911575317382812, + "learning_rate": 3.2399884772869396e-05, + "loss": 2.8351, + "step": 3397000 + }, + { + "epoch": 1.056162345908323, + "grad_norm": 7.849698066711426, + "learning_rate": 3.239729423486128e-05, + "loss": 2.8796, + "step": 3397500 + }, + { + "epoch": 1.0563177781888098, + "grad_norm": 8.763044357299805, + "learning_rate": 3.239470369685317e-05, + "loss": 2.8732, + "step": 3398000 + }, + { + "epoch": 1.0564732104692967, + "grad_norm": 8.672654151916504, + "learning_rate": 3.239211315884506e-05, + "loss": 2.8605, + "step": 3398500 + }, + { + "epoch": 1.0566286427497835, + "grad_norm": 8.755391120910645, + "learning_rate": 3.2389522620836945e-05, + "loss": 2.8199, + "step": 3399000 + }, + { + "epoch": 1.0567840750302704, + "grad_norm": 6.941488742828369, + "learning_rate": 3.238693208282883e-05, + "loss": 2.8437, + "step": 3399500 + }, + { + "epoch": 1.0569395073107573, + "grad_norm": 8.97756290435791, + "learning_rate": 3.238434154482071e-05, + "loss": 2.8842, + "step": 3400000 + }, + { + "epoch": 1.0570949395912441, + "grad_norm": 8.340110778808594, + "learning_rate": 3.23817510068126e-05, + "loss": 2.9022, + "step": 3400500 + }, + { + "epoch": 1.057250371871731, + "grad_norm": 7.505934238433838, + "learning_rate": 3.237916046880448e-05, + "loss": 2.8611, + "step": 3401000 + }, + { + "epoch": 1.0574058041522179, + "grad_norm": 13.45118522644043, + "learning_rate": 3.237656993079637e-05, + "loss": 2.8678, + "step": 3401500 + }, + { + "epoch": 1.0575612364327047, + "grad_norm": 9.090479850769043, + "learning_rate": 3.2373979392788254e-05, + "loss": 2.8363, + "step": 3402000 + }, + { + "epoch": 1.0577166687131916, + "grad_norm": 8.324230194091797, + "learning_rate": 3.237138885478014e-05, + "loss": 2.8644, + "step": 3402500 + }, + { + "epoch": 1.0578721009936785, + "grad_norm": 6.91402530670166, + "learning_rate": 3.236879831677203e-05, + "loss": 2.8669, + "step": 3403000 + }, + { + "epoch": 1.0580275332741655, + "grad_norm": 9.027361869812012, + "learning_rate": 3.2366207778763916e-05, + "loss": 2.8593, + "step": 3403500 + }, + { + "epoch": 1.0581829655546524, + "grad_norm": 7.9334516525268555, + "learning_rate": 3.2363617240755796e-05, + "loss": 2.838, + "step": 3404000 + }, + { + "epoch": 1.0583383978351393, + "grad_norm": 7.4131646156311035, + "learning_rate": 3.236102670274768e-05, + "loss": 2.879, + "step": 3404500 + }, + { + "epoch": 1.0584938301156261, + "grad_norm": 7.36235237121582, + "learning_rate": 3.235843616473957e-05, + "loss": 2.8408, + "step": 3405000 + }, + { + "epoch": 1.058649262396113, + "grad_norm": 8.514654159545898, + "learning_rate": 3.235584562673145e-05, + "loss": 2.8257, + "step": 3405500 + }, + { + "epoch": 1.0588046946765999, + "grad_norm": 7.978859901428223, + "learning_rate": 3.235325508872334e-05, + "loss": 2.9168, + "step": 3406000 + }, + { + "epoch": 1.0589601269570867, + "grad_norm": 9.44537353515625, + "learning_rate": 3.235066455071522e-05, + "loss": 2.852, + "step": 3406500 + }, + { + "epoch": 1.0591155592375736, + "grad_norm": 11.952940940856934, + "learning_rate": 3.2348074012707105e-05, + "loss": 2.8624, + "step": 3407000 + }, + { + "epoch": 1.0592709915180605, + "grad_norm": 9.14855670928955, + "learning_rate": 3.234548347469899e-05, + "loss": 2.8232, + "step": 3407500 + }, + { + "epoch": 1.0594264237985473, + "grad_norm": 11.041703224182129, + "learning_rate": 3.234289293669088e-05, + "loss": 2.8181, + "step": 3408000 + }, + { + "epoch": 1.0595818560790342, + "grad_norm": 8.633076667785645, + "learning_rate": 3.234030239868277e-05, + "loss": 2.8391, + "step": 3408500 + }, + { + "epoch": 1.059737288359521, + "grad_norm": 9.14112377166748, + "learning_rate": 3.2337711860674654e-05, + "loss": 2.8815, + "step": 3409000 + }, + { + "epoch": 1.059892720640008, + "grad_norm": 26.912858963012695, + "learning_rate": 3.2335121322666534e-05, + "loss": 2.8775, + "step": 3409500 + }, + { + "epoch": 1.0600481529204948, + "grad_norm": 7.697815418243408, + "learning_rate": 3.233253078465842e-05, + "loss": 2.8824, + "step": 3410000 + }, + { + "epoch": 1.0602035852009817, + "grad_norm": 5.724733829498291, + "learning_rate": 3.232994024665031e-05, + "loss": 2.8515, + "step": 3410500 + }, + { + "epoch": 1.0603590174814685, + "grad_norm": 10.388978958129883, + "learning_rate": 3.232734970864219e-05, + "loss": 2.9043, + "step": 3411000 + }, + { + "epoch": 1.0605144497619554, + "grad_norm": 9.278973579406738, + "learning_rate": 3.2324759170634076e-05, + "loss": 2.8568, + "step": 3411500 + }, + { + "epoch": 1.0606698820424423, + "grad_norm": 10.475787162780762, + "learning_rate": 3.2322168632625957e-05, + "loss": 2.8627, + "step": 3412000 + }, + { + "epoch": 1.0608253143229291, + "grad_norm": 7.487390518188477, + "learning_rate": 3.231957809461785e-05, + "loss": 2.8517, + "step": 3412500 + }, + { + "epoch": 1.060980746603416, + "grad_norm": 8.137925148010254, + "learning_rate": 3.231698755660974e-05, + "loss": 2.8813, + "step": 3413000 + }, + { + "epoch": 1.0611361788839029, + "grad_norm": 7.610116958618164, + "learning_rate": 3.231439701860162e-05, + "loss": 2.8679, + "step": 3413500 + }, + { + "epoch": 1.0612916111643897, + "grad_norm": 7.172387599945068, + "learning_rate": 3.2311806480593505e-05, + "loss": 2.8743, + "step": 3414000 + }, + { + "epoch": 1.0614470434448766, + "grad_norm": 7.645350456237793, + "learning_rate": 3.230921594258539e-05, + "loss": 2.8726, + "step": 3414500 + }, + { + "epoch": 1.0616024757253637, + "grad_norm": 9.98416519165039, + "learning_rate": 3.230662540457727e-05, + "loss": 2.8835, + "step": 3415000 + }, + { + "epoch": 1.0617579080058506, + "grad_norm": 10.188164710998535, + "learning_rate": 3.230403486656916e-05, + "loss": 2.8482, + "step": 3415500 + }, + { + "epoch": 1.0619133402863374, + "grad_norm": 8.938488006591797, + "learning_rate": 3.230144432856105e-05, + "loss": 2.8236, + "step": 3416000 + }, + { + "epoch": 1.0620687725668243, + "grad_norm": 11.028077125549316, + "learning_rate": 3.229885379055293e-05, + "loss": 2.8735, + "step": 3416500 + }, + { + "epoch": 1.0622242048473112, + "grad_norm": 7.773472309112549, + "learning_rate": 3.2296263252544815e-05, + "loss": 2.8552, + "step": 3417000 + }, + { + "epoch": 1.062379637127798, + "grad_norm": 20.55824851989746, + "learning_rate": 3.22936727145367e-05, + "loss": 2.8978, + "step": 3417500 + }, + { + "epoch": 1.062535069408285, + "grad_norm": 8.902398109436035, + "learning_rate": 3.229108217652859e-05, + "loss": 2.8879, + "step": 3418000 + }, + { + "epoch": 1.0626905016887718, + "grad_norm": 9.714091300964355, + "learning_rate": 3.2288491638520476e-05, + "loss": 2.8458, + "step": 3418500 + }, + { + "epoch": 1.0628459339692586, + "grad_norm": 7.789453506469727, + "learning_rate": 3.2285901100512356e-05, + "loss": 2.8177, + "step": 3419000 + }, + { + "epoch": 1.0630013662497455, + "grad_norm": 13.910080909729004, + "learning_rate": 3.2283310562504243e-05, + "loss": 2.8847, + "step": 3419500 + }, + { + "epoch": 1.0631567985302324, + "grad_norm": 8.069589614868164, + "learning_rate": 3.228072002449613e-05, + "loss": 2.869, + "step": 3420000 + }, + { + "epoch": 1.0633122308107192, + "grad_norm": 9.058860778808594, + "learning_rate": 3.227812948648801e-05, + "loss": 2.8919, + "step": 3420500 + }, + { + "epoch": 1.063467663091206, + "grad_norm": 9.739229202270508, + "learning_rate": 3.22755389484799e-05, + "loss": 2.8744, + "step": 3421000 + }, + { + "epoch": 1.063623095371693, + "grad_norm": 7.598367691040039, + "learning_rate": 3.2272948410471785e-05, + "loss": 2.7933, + "step": 3421500 + }, + { + "epoch": 1.0637785276521798, + "grad_norm": 11.764344215393066, + "learning_rate": 3.2270357872463666e-05, + "loss": 2.8463, + "step": 3422000 + }, + { + "epoch": 1.0639339599326667, + "grad_norm": 17.78154945373535, + "learning_rate": 3.226776733445556e-05, + "loss": 2.846, + "step": 3422500 + }, + { + "epoch": 1.0640893922131536, + "grad_norm": 44.393653869628906, + "learning_rate": 3.226517679644745e-05, + "loss": 2.8433, + "step": 3423000 + }, + { + "epoch": 1.0642448244936404, + "grad_norm": 9.282220840454102, + "learning_rate": 3.226258625843933e-05, + "loss": 2.9069, + "step": 3423500 + }, + { + "epoch": 1.0644002567741273, + "grad_norm": 10.74553394317627, + "learning_rate": 3.2259995720431214e-05, + "loss": 2.8541, + "step": 3424000 + }, + { + "epoch": 1.0645556890546142, + "grad_norm": 9.589699745178223, + "learning_rate": 3.2257405182423095e-05, + "loss": 2.8531, + "step": 3424500 + }, + { + "epoch": 1.064711121335101, + "grad_norm": 8.052886009216309, + "learning_rate": 3.225481464441498e-05, + "loss": 2.8539, + "step": 3425000 + }, + { + "epoch": 1.064866553615588, + "grad_norm": 22.47702980041504, + "learning_rate": 3.225222410640687e-05, + "loss": 2.8889, + "step": 3425500 + }, + { + "epoch": 1.0650219858960748, + "grad_norm": 9.689362525939941, + "learning_rate": 3.224963356839875e-05, + "loss": 2.8381, + "step": 3426000 + }, + { + "epoch": 1.0651774181765616, + "grad_norm": 10.49497127532959, + "learning_rate": 3.2247043030390637e-05, + "loss": 2.8547, + "step": 3426500 + }, + { + "epoch": 1.0653328504570485, + "grad_norm": 7.208100318908691, + "learning_rate": 3.2244452492382524e-05, + "loss": 2.8729, + "step": 3427000 + }, + { + "epoch": 1.0654882827375356, + "grad_norm": 8.309297561645508, + "learning_rate": 3.224186195437441e-05, + "loss": 2.833, + "step": 3427500 + }, + { + "epoch": 1.0656437150180225, + "grad_norm": 10.167060852050781, + "learning_rate": 3.22392714163663e-05, + "loss": 2.8649, + "step": 3428000 + }, + { + "epoch": 1.0657991472985093, + "grad_norm": 10.642210960388184, + "learning_rate": 3.2236680878358185e-05, + "loss": 2.891, + "step": 3428500 + }, + { + "epoch": 1.0659545795789962, + "grad_norm": 9.628883361816406, + "learning_rate": 3.2234090340350066e-05, + "loss": 2.863, + "step": 3429000 + }, + { + "epoch": 1.066110011859483, + "grad_norm": 6.896968841552734, + "learning_rate": 3.223149980234195e-05, + "loss": 2.8727, + "step": 3429500 + }, + { + "epoch": 1.06626544413997, + "grad_norm": 9.666546821594238, + "learning_rate": 3.222890926433383e-05, + "loss": 2.8848, + "step": 3430000 + }, + { + "epoch": 1.0664208764204568, + "grad_norm": 14.759296417236328, + "learning_rate": 3.222631872632572e-05, + "loss": 2.86, + "step": 3430500 + }, + { + "epoch": 1.0665763087009437, + "grad_norm": 10.501471519470215, + "learning_rate": 3.222372818831761e-05, + "loss": 2.8218, + "step": 3431000 + }, + { + "epoch": 1.0667317409814305, + "grad_norm": 8.725295066833496, + "learning_rate": 3.222113765030949e-05, + "loss": 2.84, + "step": 3431500 + }, + { + "epoch": 1.0668871732619174, + "grad_norm": 9.506980895996094, + "learning_rate": 3.2218547112301375e-05, + "loss": 2.8331, + "step": 3432000 + }, + { + "epoch": 1.0670426055424043, + "grad_norm": 23.374753952026367, + "learning_rate": 3.221595657429327e-05, + "loss": 2.8501, + "step": 3432500 + }, + { + "epoch": 1.0671980378228911, + "grad_norm": 8.050731658935547, + "learning_rate": 3.221336603628515e-05, + "loss": 2.8923, + "step": 3433000 + }, + { + "epoch": 1.067353470103378, + "grad_norm": 9.56890869140625, + "learning_rate": 3.2210775498277036e-05, + "loss": 2.8537, + "step": 3433500 + }, + { + "epoch": 1.0675089023838649, + "grad_norm": 15.022614479064941, + "learning_rate": 3.2208184960268924e-05, + "loss": 2.8317, + "step": 3434000 + }, + { + "epoch": 1.0676643346643517, + "grad_norm": 6.944094657897949, + "learning_rate": 3.2205594422260804e-05, + "loss": 2.8776, + "step": 3434500 + }, + { + "epoch": 1.0678197669448386, + "grad_norm": 8.139132499694824, + "learning_rate": 3.220300388425269e-05, + "loss": 2.8353, + "step": 3435000 + }, + { + "epoch": 1.0679751992253255, + "grad_norm": 9.76451587677002, + "learning_rate": 3.220041334624458e-05, + "loss": 2.8367, + "step": 3435500 + }, + { + "epoch": 1.0681306315058123, + "grad_norm": 9.115972518920898, + "learning_rate": 3.219782280823646e-05, + "loss": 2.8493, + "step": 3436000 + }, + { + "epoch": 1.0682860637862992, + "grad_norm": 8.35020637512207, + "learning_rate": 3.2195232270228346e-05, + "loss": 2.8315, + "step": 3436500 + }, + { + "epoch": 1.068441496066786, + "grad_norm": 9.741632461547852, + "learning_rate": 3.219264173222023e-05, + "loss": 2.8482, + "step": 3437000 + }, + { + "epoch": 1.068596928347273, + "grad_norm": 8.448067665100098, + "learning_rate": 3.219005119421212e-05, + "loss": 2.8398, + "step": 3437500 + }, + { + "epoch": 1.0687523606277598, + "grad_norm": 8.937536239624023, + "learning_rate": 3.218746065620401e-05, + "loss": 2.8248, + "step": 3438000 + }, + { + "epoch": 1.0689077929082467, + "grad_norm": 8.791178703308105, + "learning_rate": 3.218487011819589e-05, + "loss": 2.85, + "step": 3438500 + }, + { + "epoch": 1.0690632251887338, + "grad_norm": 8.295760154724121, + "learning_rate": 3.2182279580187775e-05, + "loss": 2.864, + "step": 3439000 + }, + { + "epoch": 1.0692186574692206, + "grad_norm": 8.760025024414062, + "learning_rate": 3.217968904217966e-05, + "loss": 2.916, + "step": 3439500 + }, + { + "epoch": 1.0693740897497075, + "grad_norm": 8.936299324035645, + "learning_rate": 3.217709850417154e-05, + "loss": 2.894, + "step": 3440000 + }, + { + "epoch": 1.0695295220301944, + "grad_norm": 9.120868682861328, + "learning_rate": 3.217450796616343e-05, + "loss": 2.8431, + "step": 3440500 + }, + { + "epoch": 1.0696849543106812, + "grad_norm": 8.406193733215332, + "learning_rate": 3.2171917428155317e-05, + "loss": 2.854, + "step": 3441000 + }, + { + "epoch": 1.069840386591168, + "grad_norm": 12.804454803466797, + "learning_rate": 3.21693268901472e-05, + "loss": 2.9226, + "step": 3441500 + }, + { + "epoch": 1.069995818871655, + "grad_norm": 9.891057968139648, + "learning_rate": 3.2166736352139084e-05, + "loss": 2.8066, + "step": 3442000 + }, + { + "epoch": 1.0701512511521418, + "grad_norm": 7.36544942855835, + "learning_rate": 3.216414581413097e-05, + "loss": 2.8981, + "step": 3442500 + }, + { + "epoch": 1.0703066834326287, + "grad_norm": 9.9996976852417, + "learning_rate": 3.216155527612286e-05, + "loss": 2.8453, + "step": 3443000 + }, + { + "epoch": 1.0704621157131156, + "grad_norm": 8.409035682678223, + "learning_rate": 3.2158964738114746e-05, + "loss": 2.861, + "step": 3443500 + }, + { + "epoch": 1.0706175479936024, + "grad_norm": 8.324844360351562, + "learning_rate": 3.2156374200106626e-05, + "loss": 2.8461, + "step": 3444000 + }, + { + "epoch": 1.0707729802740893, + "grad_norm": 12.122032165527344, + "learning_rate": 3.215378366209851e-05, + "loss": 2.8311, + "step": 3444500 + }, + { + "epoch": 1.0709284125545762, + "grad_norm": 9.098759651184082, + "learning_rate": 3.21511931240904e-05, + "loss": 2.8487, + "step": 3445000 + }, + { + "epoch": 1.071083844835063, + "grad_norm": 8.66715145111084, + "learning_rate": 3.214860258608228e-05, + "loss": 2.8647, + "step": 3445500 + }, + { + "epoch": 1.07123927711555, + "grad_norm": 13.412760734558105, + "learning_rate": 3.214601204807417e-05, + "loss": 2.8667, + "step": 3446000 + }, + { + "epoch": 1.0713947093960368, + "grad_norm": 6.803606986999512, + "learning_rate": 3.2143421510066055e-05, + "loss": 2.9043, + "step": 3446500 + }, + { + "epoch": 1.0715501416765236, + "grad_norm": 9.223868370056152, + "learning_rate": 3.214083097205794e-05, + "loss": 2.8879, + "step": 3447000 + }, + { + "epoch": 1.0717055739570105, + "grad_norm": 7.034148693084717, + "learning_rate": 3.213824043404983e-05, + "loss": 2.8782, + "step": 3447500 + }, + { + "epoch": 1.0718610062374974, + "grad_norm": 9.272462844848633, + "learning_rate": 3.213564989604171e-05, + "loss": 2.8507, + "step": 3448000 + }, + { + "epoch": 1.0720164385179842, + "grad_norm": 6.848198413848877, + "learning_rate": 3.21330593580336e-05, + "loss": 2.8429, + "step": 3448500 + }, + { + "epoch": 1.072171870798471, + "grad_norm": 6.285412311553955, + "learning_rate": 3.2130468820025484e-05, + "loss": 2.8676, + "step": 3449000 + }, + { + "epoch": 1.072327303078958, + "grad_norm": 14.254814147949219, + "learning_rate": 3.2127878282017364e-05, + "loss": 2.8446, + "step": 3449500 + }, + { + "epoch": 1.0724827353594448, + "grad_norm": 8.86953067779541, + "learning_rate": 3.212528774400925e-05, + "loss": 2.8374, + "step": 3450000 + }, + { + "epoch": 1.0726381676399317, + "grad_norm": 14.245584487915039, + "learning_rate": 3.212269720600114e-05, + "loss": 2.8367, + "step": 3450500 + }, + { + "epoch": 1.0727935999204186, + "grad_norm": 8.414252281188965, + "learning_rate": 3.212010666799302e-05, + "loss": 2.8435, + "step": 3451000 + }, + { + "epoch": 1.0729490322009057, + "grad_norm": 20.592418670654297, + "learning_rate": 3.2117516129984906e-05, + "loss": 2.8557, + "step": 3451500 + }, + { + "epoch": 1.0731044644813925, + "grad_norm": 8.299610137939453, + "learning_rate": 3.211492559197679e-05, + "loss": 2.8884, + "step": 3452000 + }, + { + "epoch": 1.0732598967618794, + "grad_norm": 19.288171768188477, + "learning_rate": 3.211233505396868e-05, + "loss": 2.852, + "step": 3452500 + }, + { + "epoch": 1.0734153290423663, + "grad_norm": 10.882865905761719, + "learning_rate": 3.210974451596057e-05, + "loss": 2.8464, + "step": 3453000 + }, + { + "epoch": 1.0735707613228531, + "grad_norm": 9.339032173156738, + "learning_rate": 3.2107153977952455e-05, + "loss": 2.8595, + "step": 3453500 + }, + { + "epoch": 1.07372619360334, + "grad_norm": 8.39345645904541, + "learning_rate": 3.2104563439944335e-05, + "loss": 2.8276, + "step": 3454000 + }, + { + "epoch": 1.0738816258838269, + "grad_norm": 8.623703002929688, + "learning_rate": 3.210197290193622e-05, + "loss": 2.8592, + "step": 3454500 + }, + { + "epoch": 1.0740370581643137, + "grad_norm": 19.974838256835938, + "learning_rate": 3.20993823639281e-05, + "loss": 2.8848, + "step": 3455000 + }, + { + "epoch": 1.0741924904448006, + "grad_norm": 8.121817588806152, + "learning_rate": 3.209679182591999e-05, + "loss": 2.892, + "step": 3455500 + }, + { + "epoch": 1.0743479227252875, + "grad_norm": 8.592859268188477, + "learning_rate": 3.209420128791188e-05, + "loss": 2.8517, + "step": 3456000 + }, + { + "epoch": 1.0745033550057743, + "grad_norm": 80.64271545410156, + "learning_rate": 3.2091610749903764e-05, + "loss": 2.8429, + "step": 3456500 + }, + { + "epoch": 1.0746587872862612, + "grad_norm": 8.952055931091309, + "learning_rate": 3.208902021189565e-05, + "loss": 2.8565, + "step": 3457000 + }, + { + "epoch": 1.074814219566748, + "grad_norm": 7.176537990570068, + "learning_rate": 3.208642967388754e-05, + "loss": 2.7661, + "step": 3457500 + }, + { + "epoch": 1.074969651847235, + "grad_norm": 7.685976028442383, + "learning_rate": 3.208383913587942e-05, + "loss": 2.8732, + "step": 3458000 + }, + { + "epoch": 1.0751250841277218, + "grad_norm": 10.61179256439209, + "learning_rate": 3.2081248597871306e-05, + "loss": 2.8619, + "step": 3458500 + }, + { + "epoch": 1.0752805164082087, + "grad_norm": 7.48563289642334, + "learning_rate": 3.207865805986319e-05, + "loss": 2.8344, + "step": 3459000 + }, + { + "epoch": 1.0754359486886955, + "grad_norm": 8.493525505065918, + "learning_rate": 3.2076067521855073e-05, + "loss": 2.8743, + "step": 3459500 + }, + { + "epoch": 1.0755913809691824, + "grad_norm": 16.518693923950195, + "learning_rate": 3.207347698384696e-05, + "loss": 2.8481, + "step": 3460000 + }, + { + "epoch": 1.0757468132496693, + "grad_norm": 9.076696395874023, + "learning_rate": 3.207088644583884e-05, + "loss": 2.8289, + "step": 3460500 + }, + { + "epoch": 1.0759022455301561, + "grad_norm": 8.53506851196289, + "learning_rate": 3.206829590783073e-05, + "loss": 2.8592, + "step": 3461000 + }, + { + "epoch": 1.076057677810643, + "grad_norm": 11.551828384399414, + "learning_rate": 3.2065705369822615e-05, + "loss": 2.863, + "step": 3461500 + }, + { + "epoch": 1.0762131100911299, + "grad_norm": 9.176063537597656, + "learning_rate": 3.20631148318145e-05, + "loss": 2.8522, + "step": 3462000 + }, + { + "epoch": 1.0763685423716167, + "grad_norm": 8.935830116271973, + "learning_rate": 3.206052429380639e-05, + "loss": 2.8756, + "step": 3462500 + }, + { + "epoch": 1.0765239746521038, + "grad_norm": 7.401590347290039, + "learning_rate": 3.205793375579828e-05, + "loss": 2.8129, + "step": 3463000 + }, + { + "epoch": 1.0766794069325907, + "grad_norm": 23.453332901000977, + "learning_rate": 3.205534321779016e-05, + "loss": 2.9001, + "step": 3463500 + }, + { + "epoch": 1.0768348392130775, + "grad_norm": 8.71076488494873, + "learning_rate": 3.2052752679782044e-05, + "loss": 2.8673, + "step": 3464000 + }, + { + "epoch": 1.0769902714935644, + "grad_norm": 9.066500663757324, + "learning_rate": 3.205016214177393e-05, + "loss": 2.8493, + "step": 3464500 + }, + { + "epoch": 1.0771457037740513, + "grad_norm": 7.671414375305176, + "learning_rate": 3.204757160376581e-05, + "loss": 2.8383, + "step": 3465000 + }, + { + "epoch": 1.0773011360545381, + "grad_norm": 6.000478744506836, + "learning_rate": 3.20449810657577e-05, + "loss": 2.8366, + "step": 3465500 + }, + { + "epoch": 1.077456568335025, + "grad_norm": 13.726282119750977, + "learning_rate": 3.2042390527749586e-05, + "loss": 2.8701, + "step": 3466000 + }, + { + "epoch": 1.0776120006155119, + "grad_norm": 8.940569877624512, + "learning_rate": 3.203979998974147e-05, + "loss": 2.853, + "step": 3466500 + }, + { + "epoch": 1.0777674328959987, + "grad_norm": 7.667110443115234, + "learning_rate": 3.203720945173336e-05, + "loss": 2.8604, + "step": 3467000 + }, + { + "epoch": 1.0779228651764856, + "grad_norm": 9.142531394958496, + "learning_rate": 3.203461891372524e-05, + "loss": 2.8605, + "step": 3467500 + }, + { + "epoch": 1.0780782974569725, + "grad_norm": 9.066308975219727, + "learning_rate": 3.203202837571713e-05, + "loss": 2.8367, + "step": 3468000 + }, + { + "epoch": 1.0782337297374593, + "grad_norm": 8.60759162902832, + "learning_rate": 3.2029437837709015e-05, + "loss": 2.8368, + "step": 3468500 + }, + { + "epoch": 1.0783891620179462, + "grad_norm": 7.935755252838135, + "learning_rate": 3.2026847299700896e-05, + "loss": 2.8766, + "step": 3469000 + }, + { + "epoch": 1.078544594298433, + "grad_norm": 9.263415336608887, + "learning_rate": 3.202425676169278e-05, + "loss": 2.8831, + "step": 3469500 + }, + { + "epoch": 1.07870002657892, + "grad_norm": 12.2131986618042, + "learning_rate": 3.202166622368467e-05, + "loss": 2.8336, + "step": 3470000 + }, + { + "epoch": 1.0788554588594068, + "grad_norm": 7.830668926239014, + "learning_rate": 3.201907568567655e-05, + "loss": 2.8738, + "step": 3470500 + }, + { + "epoch": 1.0790108911398937, + "grad_norm": 8.118240356445312, + "learning_rate": 3.201648514766844e-05, + "loss": 2.8629, + "step": 3471000 + }, + { + "epoch": 1.0791663234203805, + "grad_norm": 21.76605987548828, + "learning_rate": 3.2013894609660324e-05, + "loss": 2.8443, + "step": 3471500 + }, + { + "epoch": 1.0793217557008674, + "grad_norm": 10.845240592956543, + "learning_rate": 3.201130407165221e-05, + "loss": 2.8498, + "step": 3472000 + }, + { + "epoch": 1.0794771879813543, + "grad_norm": 9.075138092041016, + "learning_rate": 3.20087135336441e-05, + "loss": 2.8145, + "step": 3472500 + }, + { + "epoch": 1.0796326202618411, + "grad_norm": 10.716361045837402, + "learning_rate": 3.200612299563598e-05, + "loss": 2.8752, + "step": 3473000 + }, + { + "epoch": 1.079788052542328, + "grad_norm": 10.07843017578125, + "learning_rate": 3.2003532457627866e-05, + "loss": 2.8707, + "step": 3473500 + }, + { + "epoch": 1.0799434848228149, + "grad_norm": 6.410126686096191, + "learning_rate": 3.2000941919619753e-05, + "loss": 2.8835, + "step": 3474000 + }, + { + "epoch": 1.0800989171033017, + "grad_norm": 7.977897644042969, + "learning_rate": 3.1998351381611634e-05, + "loss": 2.8548, + "step": 3474500 + }, + { + "epoch": 1.0802543493837886, + "grad_norm": 11.02508544921875, + "learning_rate": 3.199576084360352e-05, + "loss": 2.8516, + "step": 3475000 + }, + { + "epoch": 1.0804097816642757, + "grad_norm": 9.493731498718262, + "learning_rate": 3.199317030559541e-05, + "loss": 2.8695, + "step": 3475500 + }, + { + "epoch": 1.0805652139447626, + "grad_norm": 8.702515602111816, + "learning_rate": 3.1990579767587295e-05, + "loss": 2.8187, + "step": 3476000 + }, + { + "epoch": 1.0807206462252494, + "grad_norm": 8.292202949523926, + "learning_rate": 3.198798922957918e-05, + "loss": 2.8374, + "step": 3476500 + }, + { + "epoch": 1.0808760785057363, + "grad_norm": 8.12196159362793, + "learning_rate": 3.198539869157107e-05, + "loss": 2.874, + "step": 3477000 + }, + { + "epoch": 1.0810315107862232, + "grad_norm": 7.972033500671387, + "learning_rate": 3.198280815356295e-05, + "loss": 2.8435, + "step": 3477500 + }, + { + "epoch": 1.08118694306671, + "grad_norm": 9.31562614440918, + "learning_rate": 3.198021761555484e-05, + "loss": 2.895, + "step": 3478000 + }, + { + "epoch": 1.081342375347197, + "grad_norm": 7.970449447631836, + "learning_rate": 3.197762707754672e-05, + "loss": 2.9331, + "step": 3478500 + }, + { + "epoch": 1.0814978076276838, + "grad_norm": 6.448821544647217, + "learning_rate": 3.1975036539538605e-05, + "loss": 2.8325, + "step": 3479000 + }, + { + "epoch": 1.0816532399081706, + "grad_norm": 9.343764305114746, + "learning_rate": 3.197244600153049e-05, + "loss": 2.8861, + "step": 3479500 + }, + { + "epoch": 1.0818086721886575, + "grad_norm": 11.011688232421875, + "learning_rate": 3.196985546352237e-05, + "loss": 2.8334, + "step": 3480000 + }, + { + "epoch": 1.0819641044691444, + "grad_norm": 6.644672870635986, + "learning_rate": 3.196726492551426e-05, + "loss": 2.8497, + "step": 3480500 + }, + { + "epoch": 1.0821195367496312, + "grad_norm": 8.006393432617188, + "learning_rate": 3.1964674387506147e-05, + "loss": 2.8303, + "step": 3481000 + }, + { + "epoch": 1.082274969030118, + "grad_norm": 9.098848342895508, + "learning_rate": 3.1962083849498034e-05, + "loss": 2.8345, + "step": 3481500 + }, + { + "epoch": 1.082430401310605, + "grad_norm": 11.406760215759277, + "learning_rate": 3.195949331148992e-05, + "loss": 2.9076, + "step": 3482000 + }, + { + "epoch": 1.0825858335910918, + "grad_norm": 21.096023559570312, + "learning_rate": 3.195690277348181e-05, + "loss": 2.918, + "step": 3482500 + }, + { + "epoch": 1.0827412658715787, + "grad_norm": 14.043846130371094, + "learning_rate": 3.195431223547369e-05, + "loss": 2.8576, + "step": 3483000 + }, + { + "epoch": 1.0828966981520656, + "grad_norm": 11.357649803161621, + "learning_rate": 3.1951721697465576e-05, + "loss": 2.854, + "step": 3483500 + }, + { + "epoch": 1.0830521304325524, + "grad_norm": 9.419333457946777, + "learning_rate": 3.194913115945746e-05, + "loss": 2.8344, + "step": 3484000 + }, + { + "epoch": 1.0832075627130393, + "grad_norm": 7.84119987487793, + "learning_rate": 3.194654062144934e-05, + "loss": 2.9179, + "step": 3484500 + }, + { + "epoch": 1.0833629949935262, + "grad_norm": 9.28167724609375, + "learning_rate": 3.194395008344123e-05, + "loss": 2.8522, + "step": 3485000 + }, + { + "epoch": 1.083518427274013, + "grad_norm": 8.835888862609863, + "learning_rate": 3.194135954543311e-05, + "loss": 2.8485, + "step": 3485500 + }, + { + "epoch": 1.0836738595545, + "grad_norm": 7.04529333114624, + "learning_rate": 3.1938769007425005e-05, + "loss": 2.8543, + "step": 3486000 + }, + { + "epoch": 1.0838292918349868, + "grad_norm": 9.383784294128418, + "learning_rate": 3.193617846941689e-05, + "loss": 2.8374, + "step": 3486500 + }, + { + "epoch": 1.0839847241154739, + "grad_norm": 8.247817993164062, + "learning_rate": 3.193358793140877e-05, + "loss": 2.7973, + "step": 3487000 + }, + { + "epoch": 1.0841401563959607, + "grad_norm": 9.35502815246582, + "learning_rate": 3.193099739340066e-05, + "loss": 2.8833, + "step": 3487500 + }, + { + "epoch": 1.0842955886764476, + "grad_norm": 9.277974128723145, + "learning_rate": 3.1928406855392546e-05, + "loss": 2.8632, + "step": 3488000 + }, + { + "epoch": 1.0844510209569345, + "grad_norm": 79.77008819580078, + "learning_rate": 3.192581631738443e-05, + "loss": 2.8884, + "step": 3488500 + }, + { + "epoch": 1.0846064532374213, + "grad_norm": 7.557009220123291, + "learning_rate": 3.1923225779376314e-05, + "loss": 2.8534, + "step": 3489000 + }, + { + "epoch": 1.0847618855179082, + "grad_norm": 9.29391860961914, + "learning_rate": 3.19206352413682e-05, + "loss": 2.8322, + "step": 3489500 + }, + { + "epoch": 1.084917317798395, + "grad_norm": 6.397099018096924, + "learning_rate": 3.191804470336008e-05, + "loss": 2.8782, + "step": 3490000 + }, + { + "epoch": 1.085072750078882, + "grad_norm": 10.479498863220215, + "learning_rate": 3.191545416535197e-05, + "loss": 2.8804, + "step": 3490500 + }, + { + "epoch": 1.0852281823593688, + "grad_norm": 8.826865196228027, + "learning_rate": 3.1912863627343856e-05, + "loss": 2.8879, + "step": 3491000 + }, + { + "epoch": 1.0853836146398557, + "grad_norm": 7.9710612297058105, + "learning_rate": 3.191027308933574e-05, + "loss": 2.8462, + "step": 3491500 + }, + { + "epoch": 1.0855390469203425, + "grad_norm": 8.817770004272461, + "learning_rate": 3.190768255132763e-05, + "loss": 2.8622, + "step": 3492000 + }, + { + "epoch": 1.0856944792008294, + "grad_norm": 19.398550033569336, + "learning_rate": 3.190509201331951e-05, + "loss": 2.8556, + "step": 3492500 + }, + { + "epoch": 1.0858499114813163, + "grad_norm": 8.790654182434082, + "learning_rate": 3.19025014753114e-05, + "loss": 2.8884, + "step": 3493000 + }, + { + "epoch": 1.0860053437618031, + "grad_norm": 8.086095809936523, + "learning_rate": 3.1899910937303285e-05, + "loss": 2.8718, + "step": 3493500 + }, + { + "epoch": 1.08616077604229, + "grad_norm": 44.97760009765625, + "learning_rate": 3.1897320399295165e-05, + "loss": 2.8685, + "step": 3494000 + }, + { + "epoch": 1.0863162083227769, + "grad_norm": 12.644346237182617, + "learning_rate": 3.189472986128705e-05, + "loss": 2.844, + "step": 3494500 + }, + { + "epoch": 1.0864716406032637, + "grad_norm": 8.640166282653809, + "learning_rate": 3.189213932327894e-05, + "loss": 2.86, + "step": 3495000 + }, + { + "epoch": 1.0866270728837506, + "grad_norm": 8.735261917114258, + "learning_rate": 3.188954878527082e-05, + "loss": 2.8544, + "step": 3495500 + }, + { + "epoch": 1.0867825051642375, + "grad_norm": 13.922754287719727, + "learning_rate": 3.1886958247262714e-05, + "loss": 2.8582, + "step": 3496000 + }, + { + "epoch": 1.0869379374447243, + "grad_norm": 10.082252502441406, + "learning_rate": 3.1884367709254594e-05, + "loss": 2.8502, + "step": 3496500 + }, + { + "epoch": 1.0870933697252112, + "grad_norm": 9.497659683227539, + "learning_rate": 3.188177717124648e-05, + "loss": 2.8634, + "step": 3497000 + }, + { + "epoch": 1.087248802005698, + "grad_norm": 8.50019645690918, + "learning_rate": 3.187918663323837e-05, + "loss": 2.8757, + "step": 3497500 + }, + { + "epoch": 1.087404234286185, + "grad_norm": 8.672440528869629, + "learning_rate": 3.187659609523025e-05, + "loss": 2.9393, + "step": 3498000 + }, + { + "epoch": 1.0875596665666718, + "grad_norm": 11.167644500732422, + "learning_rate": 3.1874005557222136e-05, + "loss": 2.8393, + "step": 3498500 + }, + { + "epoch": 1.0877150988471587, + "grad_norm": 8.009954452514648, + "learning_rate": 3.187141501921402e-05, + "loss": 2.8618, + "step": 3499000 + }, + { + "epoch": 1.0878705311276455, + "grad_norm": 24.8337459564209, + "learning_rate": 3.1868824481205903e-05, + "loss": 2.8649, + "step": 3499500 + }, + { + "epoch": 1.0880259634081326, + "grad_norm": 8.277432441711426, + "learning_rate": 3.186623394319779e-05, + "loss": 2.8085, + "step": 3500000 + }, + { + "epoch": 1.0881813956886195, + "grad_norm": 9.072802543640137, + "learning_rate": 3.186364340518968e-05, + "loss": 2.8565, + "step": 3500500 + }, + { + "epoch": 1.0883368279691064, + "grad_norm": 8.515862464904785, + "learning_rate": 3.1861052867181565e-05, + "loss": 2.8315, + "step": 3501000 + }, + { + "epoch": 1.0884922602495932, + "grad_norm": 7.167074680328369, + "learning_rate": 3.185846232917345e-05, + "loss": 2.8562, + "step": 3501500 + }, + { + "epoch": 1.08864769253008, + "grad_norm": 9.2872314453125, + "learning_rate": 3.185587179116534e-05, + "loss": 2.8037, + "step": 3502000 + }, + { + "epoch": 1.088803124810567, + "grad_norm": 8.794393539428711, + "learning_rate": 3.185328125315722e-05, + "loss": 2.8869, + "step": 3502500 + }, + { + "epoch": 1.0889585570910538, + "grad_norm": 9.490242958068848, + "learning_rate": 3.185069071514911e-05, + "loss": 2.8043, + "step": 3503000 + }, + { + "epoch": 1.0891139893715407, + "grad_norm": 9.644129753112793, + "learning_rate": 3.184810017714099e-05, + "loss": 2.8544, + "step": 3503500 + }, + { + "epoch": 1.0892694216520276, + "grad_norm": 8.399873733520508, + "learning_rate": 3.1845509639132874e-05, + "loss": 2.8698, + "step": 3504000 + }, + { + "epoch": 1.0894248539325144, + "grad_norm": 8.490442276000977, + "learning_rate": 3.184291910112476e-05, + "loss": 2.868, + "step": 3504500 + }, + { + "epoch": 1.0895802862130013, + "grad_norm": 9.999784469604492, + "learning_rate": 3.184032856311664e-05, + "loss": 2.8297, + "step": 3505000 + }, + { + "epoch": 1.0897357184934882, + "grad_norm": 7.348875045776367, + "learning_rate": 3.183773802510853e-05, + "loss": 2.8421, + "step": 3505500 + }, + { + "epoch": 1.089891150773975, + "grad_norm": 14.098697662353516, + "learning_rate": 3.183514748710042e-05, + "loss": 2.8793, + "step": 3506000 + }, + { + "epoch": 1.090046583054462, + "grad_norm": 9.47559928894043, + "learning_rate": 3.18325569490923e-05, + "loss": 2.8689, + "step": 3506500 + }, + { + "epoch": 1.0902020153349488, + "grad_norm": 13.317289352416992, + "learning_rate": 3.182996641108419e-05, + "loss": 2.8728, + "step": 3507000 + }, + { + "epoch": 1.0903574476154356, + "grad_norm": 9.395106315612793, + "learning_rate": 3.182737587307608e-05, + "loss": 2.8896, + "step": 3507500 + }, + { + "epoch": 1.0905128798959225, + "grad_norm": 8.514032363891602, + "learning_rate": 3.182478533506796e-05, + "loss": 2.8373, + "step": 3508000 + }, + { + "epoch": 1.0906683121764094, + "grad_norm": 6.190799236297607, + "learning_rate": 3.1822194797059845e-05, + "loss": 2.854, + "step": 3508500 + }, + { + "epoch": 1.0908237444568962, + "grad_norm": 17.268491744995117, + "learning_rate": 3.1819604259051725e-05, + "loss": 2.8551, + "step": 3509000 + }, + { + "epoch": 1.090979176737383, + "grad_norm": 8.091720581054688, + "learning_rate": 3.181701372104361e-05, + "loss": 2.8731, + "step": 3509500 + }, + { + "epoch": 1.09113460901787, + "grad_norm": 8.087328910827637, + "learning_rate": 3.18144231830355e-05, + "loss": 2.8456, + "step": 3510000 + }, + { + "epoch": 1.0912900412983568, + "grad_norm": 9.673945426940918, + "learning_rate": 3.181183264502739e-05, + "loss": 2.8281, + "step": 3510500 + }, + { + "epoch": 1.091445473578844, + "grad_norm": 7.564610004425049, + "learning_rate": 3.1809242107019274e-05, + "loss": 2.8911, + "step": 3511000 + }, + { + "epoch": 1.0916009058593308, + "grad_norm": 8.531756401062012, + "learning_rate": 3.180665156901116e-05, + "loss": 2.8622, + "step": 3511500 + }, + { + "epoch": 1.0917563381398177, + "grad_norm": 7.767532825469971, + "learning_rate": 3.180406103100304e-05, + "loss": 2.865, + "step": 3512000 + }, + { + "epoch": 1.0919117704203045, + "grad_norm": 10.302302360534668, + "learning_rate": 3.180147049299493e-05, + "loss": 2.8305, + "step": 3512500 + }, + { + "epoch": 1.0920672027007914, + "grad_norm": 7.450089931488037, + "learning_rate": 3.1798879954986816e-05, + "loss": 2.8436, + "step": 3513000 + }, + { + "epoch": 1.0922226349812783, + "grad_norm": 10.096596717834473, + "learning_rate": 3.1796289416978696e-05, + "loss": 2.8735, + "step": 3513500 + }, + { + "epoch": 1.0923780672617651, + "grad_norm": 15.73692512512207, + "learning_rate": 3.1793698878970583e-05, + "loss": 2.8365, + "step": 3514000 + }, + { + "epoch": 1.092533499542252, + "grad_norm": 8.07757568359375, + "learning_rate": 3.1791108340962464e-05, + "loss": 2.9382, + "step": 3514500 + }, + { + "epoch": 1.0926889318227389, + "grad_norm": 11.03728199005127, + "learning_rate": 3.178851780295435e-05, + "loss": 2.8958, + "step": 3515000 + }, + { + "epoch": 1.0928443641032257, + "grad_norm": 8.644713401794434, + "learning_rate": 3.178592726494624e-05, + "loss": 2.9388, + "step": 3515500 + }, + { + "epoch": 1.0929997963837126, + "grad_norm": 9.022377967834473, + "learning_rate": 3.1783336726938125e-05, + "loss": 2.8398, + "step": 3516000 + }, + { + "epoch": 1.0931552286641995, + "grad_norm": 12.189324378967285, + "learning_rate": 3.178074618893001e-05, + "loss": 2.9138, + "step": 3516500 + }, + { + "epoch": 1.0933106609446863, + "grad_norm": 30.878555297851562, + "learning_rate": 3.17781556509219e-05, + "loss": 2.9597, + "step": 3517000 + }, + { + "epoch": 1.0934660932251732, + "grad_norm": 9.226027488708496, + "learning_rate": 3.177556511291378e-05, + "loss": 2.8553, + "step": 3517500 + }, + { + "epoch": 1.09362152550566, + "grad_norm": 8.719295501708984, + "learning_rate": 3.177297457490567e-05, + "loss": 2.8905, + "step": 3518000 + }, + { + "epoch": 1.093776957786147, + "grad_norm": 11.092608451843262, + "learning_rate": 3.1770384036897554e-05, + "loss": 2.8873, + "step": 3518500 + }, + { + "epoch": 1.0939323900666338, + "grad_norm": 185.5362091064453, + "learning_rate": 3.1767793498889435e-05, + "loss": 2.8934, + "step": 3519000 + }, + { + "epoch": 1.0940878223471207, + "grad_norm": 8.56281852722168, + "learning_rate": 3.176520296088132e-05, + "loss": 2.9241, + "step": 3519500 + }, + { + "epoch": 1.0942432546276075, + "grad_norm": 10.91929817199707, + "learning_rate": 3.176261242287321e-05, + "loss": 2.9364, + "step": 3520000 + }, + { + "epoch": 1.0943986869080944, + "grad_norm": 10.93734359741211, + "learning_rate": 3.1760021884865096e-05, + "loss": 2.9406, + "step": 3520500 + }, + { + "epoch": 1.0945541191885813, + "grad_norm": 8.447683334350586, + "learning_rate": 3.175743134685698e-05, + "loss": 2.9264, + "step": 3521000 + }, + { + "epoch": 1.0947095514690681, + "grad_norm": 10.515178680419922, + "learning_rate": 3.1754840808848864e-05, + "loss": 2.9141, + "step": 3521500 + }, + { + "epoch": 1.094864983749555, + "grad_norm": 22.866535186767578, + "learning_rate": 3.175225027084075e-05, + "loss": 2.897, + "step": 3522000 + }, + { + "epoch": 1.0950204160300419, + "grad_norm": 12.752693176269531, + "learning_rate": 3.174965973283264e-05, + "loss": 2.9274, + "step": 3522500 + }, + { + "epoch": 1.0951758483105287, + "grad_norm": 10.49576187133789, + "learning_rate": 3.174706919482452e-05, + "loss": 3.0028, + "step": 3523000 + }, + { + "epoch": 1.0953312805910156, + "grad_norm": 7.99498176574707, + "learning_rate": 3.1744478656816405e-05, + "loss": 2.9317, + "step": 3523500 + }, + { + "epoch": 1.0954867128715027, + "grad_norm": 10.19862174987793, + "learning_rate": 3.174188811880829e-05, + "loss": 2.9671, + "step": 3524000 + }, + { + "epoch": 1.0956421451519895, + "grad_norm": 16.077404022216797, + "learning_rate": 3.173929758080017e-05, + "loss": 2.9265, + "step": 3524500 + }, + { + "epoch": 1.0957975774324764, + "grad_norm": 9.264823913574219, + "learning_rate": 3.173670704279206e-05, + "loss": 2.9122, + "step": 3525000 + }, + { + "epoch": 1.0959530097129633, + "grad_norm": 9.497956275939941, + "learning_rate": 3.173411650478395e-05, + "loss": 2.876, + "step": 3525500 + }, + { + "epoch": 1.0961084419934501, + "grad_norm": 7.574928283691406, + "learning_rate": 3.1731525966775834e-05, + "loss": 2.9221, + "step": 3526000 + }, + { + "epoch": 1.096263874273937, + "grad_norm": 9.11922836303711, + "learning_rate": 3.172893542876772e-05, + "loss": 2.9144, + "step": 3526500 + }, + { + "epoch": 1.0964193065544239, + "grad_norm": 14.992292404174805, + "learning_rate": 3.17263448907596e-05, + "loss": 2.889, + "step": 3527000 + }, + { + "epoch": 1.0965747388349107, + "grad_norm": 10.073390007019043, + "learning_rate": 3.172375435275149e-05, + "loss": 2.8889, + "step": 3527500 + }, + { + "epoch": 1.0967301711153976, + "grad_norm": 7.631139278411865, + "learning_rate": 3.1721163814743376e-05, + "loss": 2.9162, + "step": 3528000 + }, + { + "epoch": 1.0968856033958845, + "grad_norm": 7.726728439331055, + "learning_rate": 3.171857327673526e-05, + "loss": 2.9275, + "step": 3528500 + }, + { + "epoch": 1.0970410356763713, + "grad_norm": 12.1392822265625, + "learning_rate": 3.1715982738727144e-05, + "loss": 2.9374, + "step": 3529000 + }, + { + "epoch": 1.0971964679568582, + "grad_norm": 13.668643951416016, + "learning_rate": 3.171339220071903e-05, + "loss": 2.9078, + "step": 3529500 + }, + { + "epoch": 1.097351900237345, + "grad_norm": 6.475322723388672, + "learning_rate": 3.171080166271092e-05, + "loss": 2.9295, + "step": 3530000 + }, + { + "epoch": 1.097507332517832, + "grad_norm": 5.314282417297363, + "learning_rate": 3.1708211124702805e-05, + "loss": 2.8604, + "step": 3530500 + }, + { + "epoch": 1.0976627647983188, + "grad_norm": 12.387959480285645, + "learning_rate": 3.170562058669469e-05, + "loss": 2.8758, + "step": 3531000 + }, + { + "epoch": 1.0978181970788057, + "grad_norm": 9.478181838989258, + "learning_rate": 3.170303004868657e-05, + "loss": 2.9222, + "step": 3531500 + }, + { + "epoch": 1.0979736293592925, + "grad_norm": 11.93160343170166, + "learning_rate": 3.170043951067846e-05, + "loss": 2.9708, + "step": 3532000 + }, + { + "epoch": 1.0981290616397794, + "grad_norm": 9.69024658203125, + "learning_rate": 3.169784897267034e-05, + "loss": 2.8723, + "step": 3532500 + }, + { + "epoch": 1.0982844939202663, + "grad_norm": 17.672365188598633, + "learning_rate": 3.169525843466223e-05, + "loss": 2.9092, + "step": 3533000 + }, + { + "epoch": 1.0984399262007531, + "grad_norm": 8.393465995788574, + "learning_rate": 3.1692667896654115e-05, + "loss": 2.8323, + "step": 3533500 + }, + { + "epoch": 1.09859535848124, + "grad_norm": 32.73771286010742, + "learning_rate": 3.1690077358645995e-05, + "loss": 2.9106, + "step": 3534000 + }, + { + "epoch": 1.0987507907617269, + "grad_norm": 54.18165588378906, + "learning_rate": 3.168748682063788e-05, + "loss": 2.923, + "step": 3534500 + }, + { + "epoch": 1.098906223042214, + "grad_norm": 15.222344398498535, + "learning_rate": 3.168489628262977e-05, + "loss": 2.9447, + "step": 3535000 + }, + { + "epoch": 1.0990616553227008, + "grad_norm": 9.96242618560791, + "learning_rate": 3.1682305744621657e-05, + "loss": 2.9369, + "step": 3535500 + }, + { + "epoch": 1.0992170876031877, + "grad_norm": 16.911544799804688, + "learning_rate": 3.1679715206613544e-05, + "loss": 2.8573, + "step": 3536000 + }, + { + "epoch": 1.0993725198836746, + "grad_norm": 22.252477645874023, + "learning_rate": 3.167712466860543e-05, + "loss": 2.8741, + "step": 3536500 + }, + { + "epoch": 1.0995279521641614, + "grad_norm": 16.954389572143555, + "learning_rate": 3.167453413059731e-05, + "loss": 2.8908, + "step": 3537000 + }, + { + "epoch": 1.0996833844446483, + "grad_norm": 9.266519546508789, + "learning_rate": 3.16719435925892e-05, + "loss": 2.9192, + "step": 3537500 + }, + { + "epoch": 1.0998388167251352, + "grad_norm": 6.428283214569092, + "learning_rate": 3.1669353054581085e-05, + "loss": 2.9071, + "step": 3538000 + }, + { + "epoch": 1.099994249005622, + "grad_norm": 8.585175514221191, + "learning_rate": 3.1666762516572966e-05, + "loss": 2.9184, + "step": 3538500 + }, + { + "epoch": 1.100149681286109, + "grad_norm": 9.629717826843262, + "learning_rate": 3.166417197856485e-05, + "loss": 2.9217, + "step": 3539000 + }, + { + "epoch": 1.1003051135665958, + "grad_norm": 8.230830192565918, + "learning_rate": 3.166158144055674e-05, + "loss": 2.9174, + "step": 3539500 + }, + { + "epoch": 1.1004605458470826, + "grad_norm": 12.244489669799805, + "learning_rate": 3.165899090254863e-05, + "loss": 2.8847, + "step": 3540000 + }, + { + "epoch": 1.1006159781275695, + "grad_norm": 51.849849700927734, + "learning_rate": 3.1656400364540514e-05, + "loss": 2.8505, + "step": 3540500 + }, + { + "epoch": 1.1007714104080564, + "grad_norm": 10.171134948730469, + "learning_rate": 3.1653809826532395e-05, + "loss": 2.885, + "step": 3541000 + }, + { + "epoch": 1.1009268426885432, + "grad_norm": 8.765802383422852, + "learning_rate": 3.165121928852428e-05, + "loss": 2.8935, + "step": 3541500 + }, + { + "epoch": 1.10108227496903, + "grad_norm": 9.121342658996582, + "learning_rate": 3.164862875051617e-05, + "loss": 2.8771, + "step": 3542000 + }, + { + "epoch": 1.101237707249517, + "grad_norm": 8.958356857299805, + "learning_rate": 3.164603821250805e-05, + "loss": 2.8754, + "step": 3542500 + }, + { + "epoch": 1.1013931395300038, + "grad_norm": 8.786057472229004, + "learning_rate": 3.164344767449994e-05, + "loss": 2.9113, + "step": 3543000 + }, + { + "epoch": 1.1015485718104907, + "grad_norm": 6.58328914642334, + "learning_rate": 3.1640857136491824e-05, + "loss": 2.9021, + "step": 3543500 + }, + { + "epoch": 1.1017040040909776, + "grad_norm": 8.737730026245117, + "learning_rate": 3.1638266598483704e-05, + "loss": 2.9052, + "step": 3544000 + }, + { + "epoch": 1.1018594363714644, + "grad_norm": 10.407540321350098, + "learning_rate": 3.163567606047559e-05, + "loss": 2.9228, + "step": 3544500 + }, + { + "epoch": 1.1020148686519513, + "grad_norm": 9.168575286865234, + "learning_rate": 3.163308552246748e-05, + "loss": 2.9385, + "step": 3545000 + }, + { + "epoch": 1.1021703009324382, + "grad_norm": 14.582667350769043, + "learning_rate": 3.1630494984459366e-05, + "loss": 2.9336, + "step": 3545500 + }, + { + "epoch": 1.102325733212925, + "grad_norm": 8.823845863342285, + "learning_rate": 3.162790444645125e-05, + "loss": 2.8903, + "step": 3546000 + }, + { + "epoch": 1.102481165493412, + "grad_norm": 9.595157623291016, + "learning_rate": 3.162531390844313e-05, + "loss": 2.9, + "step": 3546500 + }, + { + "epoch": 1.1026365977738988, + "grad_norm": 9.19929313659668, + "learning_rate": 3.162272337043502e-05, + "loss": 2.937, + "step": 3547000 + }, + { + "epoch": 1.1027920300543856, + "grad_norm": 9.986681938171387, + "learning_rate": 3.162013283242691e-05, + "loss": 2.9671, + "step": 3547500 + }, + { + "epoch": 1.1029474623348727, + "grad_norm": 9.130780220031738, + "learning_rate": 3.161754229441879e-05, + "loss": 2.8895, + "step": 3548000 + }, + { + "epoch": 1.1031028946153596, + "grad_norm": 13.4612398147583, + "learning_rate": 3.1614951756410675e-05, + "loss": 2.8391, + "step": 3548500 + }, + { + "epoch": 1.1032583268958465, + "grad_norm": 7.933062553405762, + "learning_rate": 3.161236121840256e-05, + "loss": 2.9192, + "step": 3549000 + }, + { + "epoch": 1.1034137591763333, + "grad_norm": 7.947417736053467, + "learning_rate": 3.160977068039445e-05, + "loss": 2.8877, + "step": 3549500 + }, + { + "epoch": 1.1035691914568202, + "grad_norm": 13.77512264251709, + "learning_rate": 3.1607180142386337e-05, + "loss": 2.9156, + "step": 3550000 + }, + { + "epoch": 1.103724623737307, + "grad_norm": 8.778081893920898, + "learning_rate": 3.160458960437822e-05, + "loss": 2.9278, + "step": 3550500 + }, + { + "epoch": 1.103880056017794, + "grad_norm": 10.007248878479004, + "learning_rate": 3.1601999066370104e-05, + "loss": 2.9803, + "step": 3551000 + }, + { + "epoch": 1.1040354882982808, + "grad_norm": 12.817951202392578, + "learning_rate": 3.159940852836199e-05, + "loss": 2.9381, + "step": 3551500 + }, + { + "epoch": 1.1041909205787677, + "grad_norm": 10.427268028259277, + "learning_rate": 3.159681799035387e-05, + "loss": 2.8748, + "step": 3552000 + }, + { + "epoch": 1.1043463528592545, + "grad_norm": 11.906009674072266, + "learning_rate": 3.159422745234576e-05, + "loss": 2.9544, + "step": 3552500 + }, + { + "epoch": 1.1045017851397414, + "grad_norm": 203.08099365234375, + "learning_rate": 3.1591636914337646e-05, + "loss": 2.9357, + "step": 3553000 + }, + { + "epoch": 1.1046572174202283, + "grad_norm": 9.844895362854004, + "learning_rate": 3.1589046376329526e-05, + "loss": 2.9115, + "step": 3553500 + }, + { + "epoch": 1.1048126497007151, + "grad_norm": 9.064706802368164, + "learning_rate": 3.1586455838321413e-05, + "loss": 2.9285, + "step": 3554000 + }, + { + "epoch": 1.104968081981202, + "grad_norm": 7.522736072540283, + "learning_rate": 3.15838653003133e-05, + "loss": 2.8993, + "step": 3554500 + }, + { + "epoch": 1.1051235142616889, + "grad_norm": 12.092110633850098, + "learning_rate": 3.158127476230519e-05, + "loss": 2.9245, + "step": 3555000 + }, + { + "epoch": 1.1052789465421757, + "grad_norm": 9.743263244628906, + "learning_rate": 3.1578684224297075e-05, + "loss": 2.923, + "step": 3555500 + }, + { + "epoch": 1.1054343788226626, + "grad_norm": 5.530144691467285, + "learning_rate": 3.157609368628896e-05, + "loss": 2.9704, + "step": 3556000 + }, + { + "epoch": 1.1055898111031495, + "grad_norm": 6.32913064956665, + "learning_rate": 3.157350314828084e-05, + "loss": 2.9738, + "step": 3556500 + }, + { + "epoch": 1.1057452433836363, + "grad_norm": 9.774497032165527, + "learning_rate": 3.157091261027273e-05, + "loss": 2.9213, + "step": 3557000 + }, + { + "epoch": 1.1059006756641232, + "grad_norm": 11.13141918182373, + "learning_rate": 3.156832207226461e-05, + "loss": 2.9371, + "step": 3557500 + }, + { + "epoch": 1.10605610794461, + "grad_norm": 18.291475296020508, + "learning_rate": 3.15657315342565e-05, + "loss": 2.9304, + "step": 3558000 + }, + { + "epoch": 1.106211540225097, + "grad_norm": 8.945637702941895, + "learning_rate": 3.1563140996248384e-05, + "loss": 2.9643, + "step": 3558500 + }, + { + "epoch": 1.1063669725055838, + "grad_norm": 12.730694770812988, + "learning_rate": 3.1560550458240265e-05, + "loss": 2.9324, + "step": 3559000 + }, + { + "epoch": 1.106522404786071, + "grad_norm": 6.8285322189331055, + "learning_rate": 3.155795992023216e-05, + "loss": 2.9237, + "step": 3559500 + }, + { + "epoch": 1.1066778370665578, + "grad_norm": 8.351716041564941, + "learning_rate": 3.1555369382224046e-05, + "loss": 2.9045, + "step": 3560000 + }, + { + "epoch": 1.1068332693470446, + "grad_norm": 9.797928810119629, + "learning_rate": 3.1552778844215926e-05, + "loss": 2.893, + "step": 3560500 + }, + { + "epoch": 1.1069887016275315, + "grad_norm": 14.481667518615723, + "learning_rate": 3.155018830620781e-05, + "loss": 2.8561, + "step": 3561000 + }, + { + "epoch": 1.1071441339080184, + "grad_norm": 9.141870498657227, + "learning_rate": 3.15475977681997e-05, + "loss": 2.8812, + "step": 3561500 + }, + { + "epoch": 1.1072995661885052, + "grad_norm": 8.43124008178711, + "learning_rate": 3.154500723019158e-05, + "loss": 2.8442, + "step": 3562000 + }, + { + "epoch": 1.107454998468992, + "grad_norm": 8.834332466125488, + "learning_rate": 3.154241669218347e-05, + "loss": 2.8307, + "step": 3562500 + }, + { + "epoch": 1.107610430749479, + "grad_norm": 10.177386283874512, + "learning_rate": 3.153982615417535e-05, + "loss": 2.888, + "step": 3563000 + }, + { + "epoch": 1.1077658630299658, + "grad_norm": 9.548042297363281, + "learning_rate": 3.1537235616167235e-05, + "loss": 2.8614, + "step": 3563500 + }, + { + "epoch": 1.1079212953104527, + "grad_norm": 9.611930847167969, + "learning_rate": 3.153464507815912e-05, + "loss": 2.9166, + "step": 3564000 + }, + { + "epoch": 1.1080767275909396, + "grad_norm": 9.221977233886719, + "learning_rate": 3.153205454015101e-05, + "loss": 2.9487, + "step": 3564500 + }, + { + "epoch": 1.1082321598714264, + "grad_norm": 9.041449546813965, + "learning_rate": 3.15294640021429e-05, + "loss": 2.8598, + "step": 3565000 + }, + { + "epoch": 1.1083875921519133, + "grad_norm": 10.16914176940918, + "learning_rate": 3.1526873464134784e-05, + "loss": 2.9014, + "step": 3565500 + }, + { + "epoch": 1.1085430244324002, + "grad_norm": 8.613224029541016, + "learning_rate": 3.1524282926126664e-05, + "loss": 2.884, + "step": 3566000 + }, + { + "epoch": 1.108698456712887, + "grad_norm": 14.527907371520996, + "learning_rate": 3.152169238811855e-05, + "loss": 2.8888, + "step": 3566500 + }, + { + "epoch": 1.108853888993374, + "grad_norm": 16.284883499145508, + "learning_rate": 3.151910185011044e-05, + "loss": 2.9113, + "step": 3567000 + }, + { + "epoch": 1.1090093212738608, + "grad_norm": 8.801419258117676, + "learning_rate": 3.151651131210232e-05, + "loss": 2.8724, + "step": 3567500 + }, + { + "epoch": 1.1091647535543476, + "grad_norm": 8.312487602233887, + "learning_rate": 3.1513920774094206e-05, + "loss": 2.8892, + "step": 3568000 + }, + { + "epoch": 1.1093201858348345, + "grad_norm": 10.177582740783691, + "learning_rate": 3.151133023608609e-05, + "loss": 2.9072, + "step": 3568500 + }, + { + "epoch": 1.1094756181153214, + "grad_norm": 8.768697738647461, + "learning_rate": 3.1508739698077974e-05, + "loss": 2.8844, + "step": 3569000 + }, + { + "epoch": 1.1096310503958082, + "grad_norm": 12.46141529083252, + "learning_rate": 3.150614916006987e-05, + "loss": 2.9755, + "step": 3569500 + }, + { + "epoch": 1.109786482676295, + "grad_norm": 17.361204147338867, + "learning_rate": 3.150355862206175e-05, + "loss": 2.9643, + "step": 3570000 + }, + { + "epoch": 1.109941914956782, + "grad_norm": 9.45152759552002, + "learning_rate": 3.1500968084053635e-05, + "loss": 2.8816, + "step": 3570500 + }, + { + "epoch": 1.1100973472372688, + "grad_norm": 11.308121681213379, + "learning_rate": 3.149837754604552e-05, + "loss": 2.9244, + "step": 3571000 + }, + { + "epoch": 1.1102527795177557, + "grad_norm": 10.524062156677246, + "learning_rate": 3.14957870080374e-05, + "loss": 2.9005, + "step": 3571500 + }, + { + "epoch": 1.1104082117982428, + "grad_norm": 11.259439468383789, + "learning_rate": 3.149319647002929e-05, + "loss": 2.8999, + "step": 3572000 + }, + { + "epoch": 1.1105636440787297, + "grad_norm": 43.83205032348633, + "learning_rate": 3.149060593202118e-05, + "loss": 2.9, + "step": 3572500 + }, + { + "epoch": 1.1107190763592165, + "grad_norm": 16.55241584777832, + "learning_rate": 3.148801539401306e-05, + "loss": 2.854, + "step": 3573000 + }, + { + "epoch": 1.1108745086397034, + "grad_norm": 8.159965515136719, + "learning_rate": 3.1485424856004945e-05, + "loss": 2.9686, + "step": 3573500 + }, + { + "epoch": 1.1110299409201903, + "grad_norm": 11.18523120880127, + "learning_rate": 3.148283431799683e-05, + "loss": 2.9227, + "step": 3574000 + }, + { + "epoch": 1.1111853732006771, + "grad_norm": 7.698137283325195, + "learning_rate": 3.148024377998872e-05, + "loss": 2.9029, + "step": 3574500 + }, + { + "epoch": 1.111340805481164, + "grad_norm": 13.422443389892578, + "learning_rate": 3.1477653241980606e-05, + "loss": 2.9112, + "step": 3575000 + }, + { + "epoch": 1.1114962377616509, + "grad_norm": 9.778125762939453, + "learning_rate": 3.1475062703972486e-05, + "loss": 2.9024, + "step": 3575500 + }, + { + "epoch": 1.1116516700421377, + "grad_norm": 8.26839542388916, + "learning_rate": 3.1472472165964374e-05, + "loss": 2.8955, + "step": 3576000 + }, + { + "epoch": 1.1118071023226246, + "grad_norm": 11.457314491271973, + "learning_rate": 3.146988162795626e-05, + "loss": 2.876, + "step": 3576500 + }, + { + "epoch": 1.1119625346031115, + "grad_norm": 6.479464054107666, + "learning_rate": 3.146729108994814e-05, + "loss": 2.8628, + "step": 3577000 + }, + { + "epoch": 1.1121179668835983, + "grad_norm": 8.918859481811523, + "learning_rate": 3.146470055194003e-05, + "loss": 2.8767, + "step": 3577500 + }, + { + "epoch": 1.1122733991640852, + "grad_norm": 9.258519172668457, + "learning_rate": 3.1462110013931915e-05, + "loss": 2.8875, + "step": 3578000 + }, + { + "epoch": 1.112428831444572, + "grad_norm": 30.81430435180664, + "learning_rate": 3.1459519475923796e-05, + "loss": 2.9095, + "step": 3578500 + }, + { + "epoch": 1.112584263725059, + "grad_norm": 10.30754566192627, + "learning_rate": 3.145692893791568e-05, + "loss": 2.89, + "step": 3579000 + }, + { + "epoch": 1.1127396960055458, + "grad_norm": 43.936126708984375, + "learning_rate": 3.145433839990758e-05, + "loss": 2.8926, + "step": 3579500 + }, + { + "epoch": 1.1128951282860327, + "grad_norm": 9.7268648147583, + "learning_rate": 3.145174786189946e-05, + "loss": 2.8668, + "step": 3580000 + }, + { + "epoch": 1.1130505605665195, + "grad_norm": 9.729190826416016, + "learning_rate": 3.1449157323891344e-05, + "loss": 2.8726, + "step": 3580500 + }, + { + "epoch": 1.1132059928470064, + "grad_norm": 11.569934844970703, + "learning_rate": 3.1446566785883225e-05, + "loss": 2.9214, + "step": 3581000 + }, + { + "epoch": 1.1133614251274933, + "grad_norm": 7.513211250305176, + "learning_rate": 3.144397624787511e-05, + "loss": 2.9003, + "step": 3581500 + }, + { + "epoch": 1.1135168574079801, + "grad_norm": 9.057258605957031, + "learning_rate": 3.1441385709867e-05, + "loss": 2.8943, + "step": 3582000 + }, + { + "epoch": 1.113672289688467, + "grad_norm": 12.20251750946045, + "learning_rate": 3.143879517185888e-05, + "loss": 2.8698, + "step": 3582500 + }, + { + "epoch": 1.1138277219689539, + "grad_norm": 9.641168594360352, + "learning_rate": 3.143620463385077e-05, + "loss": 2.8751, + "step": 3583000 + }, + { + "epoch": 1.113983154249441, + "grad_norm": 9.028679847717285, + "learning_rate": 3.1433614095842654e-05, + "loss": 2.9046, + "step": 3583500 + }, + { + "epoch": 1.1141385865299278, + "grad_norm": 8.836004257202148, + "learning_rate": 3.143102355783454e-05, + "loss": 2.9418, + "step": 3584000 + }, + { + "epoch": 1.1142940188104147, + "grad_norm": 10.63279914855957, + "learning_rate": 3.142843301982643e-05, + "loss": 2.9472, + "step": 3584500 + }, + { + "epoch": 1.1144494510909015, + "grad_norm": 9.479414939880371, + "learning_rate": 3.1425842481818315e-05, + "loss": 2.8997, + "step": 3585000 + }, + { + "epoch": 1.1146048833713884, + "grad_norm": 12.9554443359375, + "learning_rate": 3.1423251943810196e-05, + "loss": 2.8736, + "step": 3585500 + }, + { + "epoch": 1.1147603156518753, + "grad_norm": 9.292349815368652, + "learning_rate": 3.142066140580208e-05, + "loss": 2.8949, + "step": 3586000 + }, + { + "epoch": 1.1149157479323621, + "grad_norm": 245.16232299804688, + "learning_rate": 3.141807086779397e-05, + "loss": 2.8551, + "step": 3586500 + }, + { + "epoch": 1.115071180212849, + "grad_norm": 9.274307250976562, + "learning_rate": 3.141548032978585e-05, + "loss": 2.8807, + "step": 3587000 + }, + { + "epoch": 1.1152266124933359, + "grad_norm": 8.88189697265625, + "learning_rate": 3.141288979177774e-05, + "loss": 2.908, + "step": 3587500 + }, + { + "epoch": 1.1153820447738227, + "grad_norm": 8.11458683013916, + "learning_rate": 3.141029925376962e-05, + "loss": 2.9292, + "step": 3588000 + }, + { + "epoch": 1.1155374770543096, + "grad_norm": 6.507795333862305, + "learning_rate": 3.1407708715761505e-05, + "loss": 2.8455, + "step": 3588500 + }, + { + "epoch": 1.1156929093347965, + "grad_norm": 9.513267517089844, + "learning_rate": 3.140511817775339e-05, + "loss": 2.8841, + "step": 3589000 + }, + { + "epoch": 1.1158483416152833, + "grad_norm": 11.145048141479492, + "learning_rate": 3.140252763974528e-05, + "loss": 2.8864, + "step": 3589500 + }, + { + "epoch": 1.1160037738957702, + "grad_norm": 9.887998580932617, + "learning_rate": 3.1399937101737166e-05, + "loss": 2.9171, + "step": 3590000 + }, + { + "epoch": 1.116159206176257, + "grad_norm": 8.122203826904297, + "learning_rate": 3.1397346563729054e-05, + "loss": 2.9295, + "step": 3590500 + }, + { + "epoch": 1.116314638456744, + "grad_norm": 10.195655822753906, + "learning_rate": 3.1394756025720934e-05, + "loss": 2.8902, + "step": 3591000 + }, + { + "epoch": 1.1164700707372308, + "grad_norm": 142.2107391357422, + "learning_rate": 3.139216548771282e-05, + "loss": 2.9536, + "step": 3591500 + }, + { + "epoch": 1.1166255030177177, + "grad_norm": 26.69980812072754, + "learning_rate": 3.138957494970471e-05, + "loss": 2.8901, + "step": 3592000 + }, + { + "epoch": 1.1167809352982045, + "grad_norm": 5.913687229156494, + "learning_rate": 3.138698441169659e-05, + "loss": 2.8959, + "step": 3592500 + }, + { + "epoch": 1.1169363675786914, + "grad_norm": 10.815984725952148, + "learning_rate": 3.1384393873688476e-05, + "loss": 2.9066, + "step": 3593000 + }, + { + "epoch": 1.1170917998591783, + "grad_norm": 10.666102409362793, + "learning_rate": 3.138180333568036e-05, + "loss": 2.9, + "step": 3593500 + }, + { + "epoch": 1.1172472321396651, + "grad_norm": 8.287445068359375, + "learning_rate": 3.137921279767225e-05, + "loss": 2.9493, + "step": 3594000 + }, + { + "epoch": 1.117402664420152, + "grad_norm": 10.190104484558105, + "learning_rate": 3.137662225966414e-05, + "loss": 3.0244, + "step": 3594500 + }, + { + "epoch": 1.1175580967006389, + "grad_norm": 10.543582916259766, + "learning_rate": 3.137403172165602e-05, + "loss": 2.8868, + "step": 3595000 + }, + { + "epoch": 1.1177135289811257, + "grad_norm": 9.451276779174805, + "learning_rate": 3.1371441183647905e-05, + "loss": 2.8461, + "step": 3595500 + }, + { + "epoch": 1.1178689612616128, + "grad_norm": 45.22929000854492, + "learning_rate": 3.136885064563979e-05, + "loss": 2.9246, + "step": 3596000 + }, + { + "epoch": 1.1180243935420997, + "grad_norm": 8.725290298461914, + "learning_rate": 3.136626010763167e-05, + "loss": 2.8705, + "step": 3596500 + }, + { + "epoch": 1.1181798258225866, + "grad_norm": 6.0675201416015625, + "learning_rate": 3.136366956962356e-05, + "loss": 2.9124, + "step": 3597000 + }, + { + "epoch": 1.1183352581030734, + "grad_norm": 22.656761169433594, + "learning_rate": 3.136107903161545e-05, + "loss": 2.8969, + "step": 3597500 + }, + { + "epoch": 1.1184906903835603, + "grad_norm": 7.844944477081299, + "learning_rate": 3.135848849360733e-05, + "loss": 2.8791, + "step": 3598000 + }, + { + "epoch": 1.1186461226640472, + "grad_norm": 12.69201374053955, + "learning_rate": 3.1355897955599214e-05, + "loss": 2.9571, + "step": 3598500 + }, + { + "epoch": 1.118801554944534, + "grad_norm": 10.467672348022461, + "learning_rate": 3.13533074175911e-05, + "loss": 2.9202, + "step": 3599000 + }, + { + "epoch": 1.118956987225021, + "grad_norm": 10.250481605529785, + "learning_rate": 3.135071687958299e-05, + "loss": 2.9123, + "step": 3599500 + }, + { + "epoch": 1.1191124195055078, + "grad_norm": 9.939593315124512, + "learning_rate": 3.1348126341574876e-05, + "loss": 2.8868, + "step": 3600000 + }, + { + "epoch": 1.1192678517859946, + "grad_norm": 12.054964065551758, + "learning_rate": 3.1345535803566756e-05, + "loss": 2.8585, + "step": 3600500 + }, + { + "epoch": 1.1194232840664815, + "grad_norm": 8.749605178833008, + "learning_rate": 3.134294526555864e-05, + "loss": 2.8405, + "step": 3601000 + }, + { + "epoch": 1.1195787163469684, + "grad_norm": 7.97458553314209, + "learning_rate": 3.134035472755053e-05, + "loss": 2.8541, + "step": 3601500 + }, + { + "epoch": 1.1197341486274552, + "grad_norm": 10.157072067260742, + "learning_rate": 3.133776418954241e-05, + "loss": 2.871, + "step": 3602000 + }, + { + "epoch": 1.119889580907942, + "grad_norm": 10.588748931884766, + "learning_rate": 3.13351736515343e-05, + "loss": 2.8664, + "step": 3602500 + }, + { + "epoch": 1.120045013188429, + "grad_norm": 9.43729019165039, + "learning_rate": 3.1332583113526185e-05, + "loss": 2.8534, + "step": 3603000 + }, + { + "epoch": 1.1202004454689158, + "grad_norm": 8.12104606628418, + "learning_rate": 3.132999257551807e-05, + "loss": 2.9248, + "step": 3603500 + }, + { + "epoch": 1.1203558777494027, + "grad_norm": 9.23872184753418, + "learning_rate": 3.132740203750996e-05, + "loss": 2.9388, + "step": 3604000 + }, + { + "epoch": 1.1205113100298896, + "grad_norm": 15.155474662780762, + "learning_rate": 3.1324811499501847e-05, + "loss": 2.9153, + "step": 3604500 + }, + { + "epoch": 1.1206667423103764, + "grad_norm": 22.338665008544922, + "learning_rate": 3.132222096149373e-05, + "loss": 2.8895, + "step": 3605000 + }, + { + "epoch": 1.1208221745908633, + "grad_norm": 9.353049278259277, + "learning_rate": 3.1319630423485614e-05, + "loss": 2.932, + "step": 3605500 + }, + { + "epoch": 1.1209776068713502, + "grad_norm": 9.235610961914062, + "learning_rate": 3.1317039885477494e-05, + "loss": 2.878, + "step": 3606000 + }, + { + "epoch": 1.121133039151837, + "grad_norm": 6.553627014160156, + "learning_rate": 3.131444934746938e-05, + "loss": 2.953, + "step": 3606500 + }, + { + "epoch": 1.121288471432324, + "grad_norm": 11.544050216674805, + "learning_rate": 3.131185880946127e-05, + "loss": 2.873, + "step": 3607000 + }, + { + "epoch": 1.121443903712811, + "grad_norm": 7.3136162757873535, + "learning_rate": 3.130926827145315e-05, + "loss": 2.866, + "step": 3607500 + }, + { + "epoch": 1.1215993359932979, + "grad_norm": 11.399698257446289, + "learning_rate": 3.1306677733445036e-05, + "loss": 2.8662, + "step": 3608000 + }, + { + "epoch": 1.1217547682737847, + "grad_norm": 11.642143249511719, + "learning_rate": 3.1304087195436923e-05, + "loss": 2.8589, + "step": 3608500 + }, + { + "epoch": 1.1219102005542716, + "grad_norm": 8.747879028320312, + "learning_rate": 3.130149665742881e-05, + "loss": 2.8444, + "step": 3609000 + }, + { + "epoch": 1.1220656328347585, + "grad_norm": 8.542929649353027, + "learning_rate": 3.12989061194207e-05, + "loss": 2.8131, + "step": 3609500 + }, + { + "epoch": 1.1222210651152453, + "grad_norm": 11.269373893737793, + "learning_rate": 3.1296315581412585e-05, + "loss": 2.8528, + "step": 3610000 + }, + { + "epoch": 1.1223764973957322, + "grad_norm": 14.40009880065918, + "learning_rate": 3.1293725043404465e-05, + "loss": 2.8223, + "step": 3610500 + }, + { + "epoch": 1.122531929676219, + "grad_norm": 9.094355583190918, + "learning_rate": 3.129113450539635e-05, + "loss": 2.8593, + "step": 3611000 + }, + { + "epoch": 1.122687361956706, + "grad_norm": 9.168343544006348, + "learning_rate": 3.128854396738823e-05, + "loss": 2.8976, + "step": 3611500 + }, + { + "epoch": 1.1228427942371928, + "grad_norm": 19.050682067871094, + "learning_rate": 3.128595342938012e-05, + "loss": 2.8543, + "step": 3612000 + }, + { + "epoch": 1.1229982265176797, + "grad_norm": 11.184464454650879, + "learning_rate": 3.128336289137201e-05, + "loss": 2.8484, + "step": 3612500 + }, + { + "epoch": 1.1231536587981665, + "grad_norm": 9.164830207824707, + "learning_rate": 3.128077235336389e-05, + "loss": 2.8289, + "step": 3613000 + }, + { + "epoch": 1.1233090910786534, + "grad_norm": 11.32863998413086, + "learning_rate": 3.127818181535578e-05, + "loss": 2.9469, + "step": 3613500 + }, + { + "epoch": 1.1234645233591403, + "grad_norm": 20.609378814697266, + "learning_rate": 3.127559127734767e-05, + "loss": 2.8613, + "step": 3614000 + }, + { + "epoch": 1.1236199556396271, + "grad_norm": 16.378660202026367, + "learning_rate": 3.127300073933955e-05, + "loss": 2.8759, + "step": 3614500 + }, + { + "epoch": 1.123775387920114, + "grad_norm": 12.649703979492188, + "learning_rate": 3.1270410201331436e-05, + "loss": 2.8521, + "step": 3615000 + }, + { + "epoch": 1.1239308202006009, + "grad_norm": 8.559351921081543, + "learning_rate": 3.126781966332332e-05, + "loss": 2.8512, + "step": 3615500 + }, + { + "epoch": 1.1240862524810877, + "grad_norm": 10.124994277954102, + "learning_rate": 3.1265229125315204e-05, + "loss": 2.8987, + "step": 3616000 + }, + { + "epoch": 1.1242416847615746, + "grad_norm": 10.391813278198242, + "learning_rate": 3.126263858730709e-05, + "loss": 2.9502, + "step": 3616500 + }, + { + "epoch": 1.1243971170420615, + "grad_norm": 24.618831634521484, + "learning_rate": 3.126004804929897e-05, + "loss": 2.9469, + "step": 3617000 + }, + { + "epoch": 1.1245525493225483, + "grad_norm": 9.504499435424805, + "learning_rate": 3.125745751129086e-05, + "loss": 2.9543, + "step": 3617500 + }, + { + "epoch": 1.1247079816030352, + "grad_norm": 17.00250244140625, + "learning_rate": 3.1254866973282745e-05, + "loss": 2.8996, + "step": 3618000 + }, + { + "epoch": 1.124863413883522, + "grad_norm": 9.753799438476562, + "learning_rate": 3.125227643527463e-05, + "loss": 2.8382, + "step": 3618500 + }, + { + "epoch": 1.125018846164009, + "grad_norm": 11.93874454498291, + "learning_rate": 3.124968589726652e-05, + "loss": 2.8697, + "step": 3619000 + }, + { + "epoch": 1.1251742784444958, + "grad_norm": 8.691497802734375, + "learning_rate": 3.124709535925841e-05, + "loss": 2.9431, + "step": 3619500 + }, + { + "epoch": 1.1253297107249827, + "grad_norm": 9.9661226272583, + "learning_rate": 3.124450482125029e-05, + "loss": 2.8803, + "step": 3620000 + }, + { + "epoch": 1.1254851430054698, + "grad_norm": 12.05944538116455, + "learning_rate": 3.1241914283242174e-05, + "loss": 2.8724, + "step": 3620500 + }, + { + "epoch": 1.1256405752859566, + "grad_norm": 8.947443962097168, + "learning_rate": 3.123932374523406e-05, + "loss": 2.8599, + "step": 3621000 + }, + { + "epoch": 1.1257960075664435, + "grad_norm": 13.420621871948242, + "learning_rate": 3.123673320722594e-05, + "loss": 2.9068, + "step": 3621500 + }, + { + "epoch": 1.1259514398469304, + "grad_norm": 14.962555885314941, + "learning_rate": 3.123414266921783e-05, + "loss": 2.8576, + "step": 3622000 + }, + { + "epoch": 1.1261068721274172, + "grad_norm": 7.483392715454102, + "learning_rate": 3.1231552131209716e-05, + "loss": 2.8932, + "step": 3622500 + }, + { + "epoch": 1.126262304407904, + "grad_norm": 12.250661849975586, + "learning_rate": 3.12289615932016e-05, + "loss": 2.848, + "step": 3623000 + }, + { + "epoch": 1.126417736688391, + "grad_norm": 9.149162292480469, + "learning_rate": 3.122637105519349e-05, + "loss": 2.8681, + "step": 3623500 + }, + { + "epoch": 1.1265731689688778, + "grad_norm": 10.374054908752441, + "learning_rate": 3.122378051718537e-05, + "loss": 2.9048, + "step": 3624000 + }, + { + "epoch": 1.1267286012493647, + "grad_norm": 9.406096458435059, + "learning_rate": 3.122118997917726e-05, + "loss": 2.8882, + "step": 3624500 + }, + { + "epoch": 1.1268840335298516, + "grad_norm": 12.307018280029297, + "learning_rate": 3.1218599441169145e-05, + "loss": 2.8851, + "step": 3625000 + }, + { + "epoch": 1.1270394658103384, + "grad_norm": 8.850298881530762, + "learning_rate": 3.1216008903161026e-05, + "loss": 2.9153, + "step": 3625500 + }, + { + "epoch": 1.1271948980908253, + "grad_norm": 9.435564041137695, + "learning_rate": 3.121341836515291e-05, + "loss": 2.9027, + "step": 3626000 + }, + { + "epoch": 1.1273503303713122, + "grad_norm": 7.549445152282715, + "learning_rate": 3.12108278271448e-05, + "loss": 2.8369, + "step": 3626500 + }, + { + "epoch": 1.127505762651799, + "grad_norm": 6.669352054595947, + "learning_rate": 3.120823728913668e-05, + "loss": 2.8976, + "step": 3627000 + }, + { + "epoch": 1.127661194932286, + "grad_norm": 40.28005599975586, + "learning_rate": 3.120564675112857e-05, + "loss": 2.8833, + "step": 3627500 + }, + { + "epoch": 1.1278166272127728, + "grad_norm": 20.85923957824707, + "learning_rate": 3.1203056213120455e-05, + "loss": 2.8855, + "step": 3628000 + }, + { + "epoch": 1.1279720594932596, + "grad_norm": 10.092071533203125, + "learning_rate": 3.120046567511234e-05, + "loss": 2.8826, + "step": 3628500 + }, + { + "epoch": 1.1281274917737465, + "grad_norm": 24.54018211364746, + "learning_rate": 3.119787513710423e-05, + "loss": 2.9161, + "step": 3629000 + }, + { + "epoch": 1.1282829240542334, + "grad_norm": 28.847183227539062, + "learning_rate": 3.119528459909611e-05, + "loss": 2.8999, + "step": 3629500 + }, + { + "epoch": 1.1284383563347202, + "grad_norm": 13.132366180419922, + "learning_rate": 3.1192694061087996e-05, + "loss": 2.8597, + "step": 3630000 + }, + { + "epoch": 1.128593788615207, + "grad_norm": 12.342761039733887, + "learning_rate": 3.1190103523079884e-05, + "loss": 2.9069, + "step": 3630500 + }, + { + "epoch": 1.1287492208956942, + "grad_norm": 9.35662841796875, + "learning_rate": 3.1187512985071764e-05, + "loss": 2.8965, + "step": 3631000 + }, + { + "epoch": 1.128904653176181, + "grad_norm": 8.725631713867188, + "learning_rate": 3.118492244706365e-05, + "loss": 2.8525, + "step": 3631500 + }, + { + "epoch": 1.129060085456668, + "grad_norm": 7.505669593811035, + "learning_rate": 3.118233190905554e-05, + "loss": 2.8804, + "step": 3632000 + }, + { + "epoch": 1.1292155177371548, + "grad_norm": 16.98729133605957, + "learning_rate": 3.117974137104742e-05, + "loss": 2.8688, + "step": 3632500 + }, + { + "epoch": 1.1293709500176417, + "grad_norm": 8.032146453857422, + "learning_rate": 3.1177150833039306e-05, + "loss": 2.8808, + "step": 3633000 + }, + { + "epoch": 1.1295263822981285, + "grad_norm": 8.60189437866211, + "learning_rate": 3.11745602950312e-05, + "loss": 2.8777, + "step": 3633500 + }, + { + "epoch": 1.1296818145786154, + "grad_norm": 181.9139862060547, + "learning_rate": 3.117196975702308e-05, + "loss": 2.9506, + "step": 3634000 + }, + { + "epoch": 1.1298372468591023, + "grad_norm": 9.193655967712402, + "learning_rate": 3.116937921901497e-05, + "loss": 2.917, + "step": 3634500 + }, + { + "epoch": 1.1299926791395891, + "grad_norm": 10.255583763122559, + "learning_rate": 3.116678868100685e-05, + "loss": 2.8901, + "step": 3635000 + }, + { + "epoch": 1.130148111420076, + "grad_norm": 8.177725791931152, + "learning_rate": 3.1164198142998735e-05, + "loss": 2.877, + "step": 3635500 + }, + { + "epoch": 1.1303035437005629, + "grad_norm": 20.345888137817383, + "learning_rate": 3.116160760499062e-05, + "loss": 2.9302, + "step": 3636000 + }, + { + "epoch": 1.1304589759810497, + "grad_norm": 9.436029434204102, + "learning_rate": 3.11590170669825e-05, + "loss": 2.8846, + "step": 3636500 + }, + { + "epoch": 1.1306144082615366, + "grad_norm": 6.61063289642334, + "learning_rate": 3.115642652897439e-05, + "loss": 2.9014, + "step": 3637000 + }, + { + "epoch": 1.1307698405420235, + "grad_norm": 12.187776565551758, + "learning_rate": 3.115383599096628e-05, + "loss": 2.8786, + "step": 3637500 + }, + { + "epoch": 1.1309252728225103, + "grad_norm": 9.217347145080566, + "learning_rate": 3.1151245452958164e-05, + "loss": 2.9267, + "step": 3638000 + }, + { + "epoch": 1.1310807051029972, + "grad_norm": 9.72309684753418, + "learning_rate": 3.114865491495005e-05, + "loss": 2.9038, + "step": 3638500 + }, + { + "epoch": 1.131236137383484, + "grad_norm": 7.664508819580078, + "learning_rate": 3.114606437694194e-05, + "loss": 2.9633, + "step": 3639000 + }, + { + "epoch": 1.131391569663971, + "grad_norm": 26.003753662109375, + "learning_rate": 3.114347383893382e-05, + "loss": 2.9289, + "step": 3639500 + }, + { + "epoch": 1.1315470019444578, + "grad_norm": 9.879563331604004, + "learning_rate": 3.1140883300925706e-05, + "loss": 2.8994, + "step": 3640000 + }, + { + "epoch": 1.1317024342249447, + "grad_norm": 9.430671691894531, + "learning_rate": 3.113829276291759e-05, + "loss": 2.8745, + "step": 3640500 + }, + { + "epoch": 1.1318578665054315, + "grad_norm": 10.142674446105957, + "learning_rate": 3.113570222490947e-05, + "loss": 2.9655, + "step": 3641000 + }, + { + "epoch": 1.1320132987859184, + "grad_norm": 13.021510124206543, + "learning_rate": 3.113311168690136e-05, + "loss": 2.8974, + "step": 3641500 + }, + { + "epoch": 1.1321687310664053, + "grad_norm": 12.032390594482422, + "learning_rate": 3.113052114889324e-05, + "loss": 2.9063, + "step": 3642000 + }, + { + "epoch": 1.1323241633468921, + "grad_norm": 21.294008255004883, + "learning_rate": 3.112793061088513e-05, + "loss": 2.9156, + "step": 3642500 + }, + { + "epoch": 1.132479595627379, + "grad_norm": 30.532958984375, + "learning_rate": 3.1125340072877015e-05, + "loss": 2.9231, + "step": 3643000 + }, + { + "epoch": 1.1326350279078659, + "grad_norm": 15.03750228881836, + "learning_rate": 3.11227495348689e-05, + "loss": 2.9539, + "step": 3643500 + }, + { + "epoch": 1.1327904601883527, + "grad_norm": 10.781147956848145, + "learning_rate": 3.112015899686079e-05, + "loss": 2.9444, + "step": 3644000 + }, + { + "epoch": 1.1329458924688398, + "grad_norm": 15.919504165649414, + "learning_rate": 3.1117568458852676e-05, + "loss": 2.8989, + "step": 3644500 + }, + { + "epoch": 1.1331013247493267, + "grad_norm": 11.198243141174316, + "learning_rate": 3.111497792084456e-05, + "loss": 2.9221, + "step": 3645000 + }, + { + "epoch": 1.1332567570298135, + "grad_norm": 10.085455894470215, + "learning_rate": 3.1112387382836444e-05, + "loss": 2.9022, + "step": 3645500 + }, + { + "epoch": 1.1334121893103004, + "grad_norm": 16.771608352661133, + "learning_rate": 3.110979684482833e-05, + "loss": 2.907, + "step": 3646000 + }, + { + "epoch": 1.1335676215907873, + "grad_norm": 10.951683044433594, + "learning_rate": 3.110720630682021e-05, + "loss": 2.9277, + "step": 3646500 + }, + { + "epoch": 1.1337230538712741, + "grad_norm": 9.701581954956055, + "learning_rate": 3.11046157688121e-05, + "loss": 2.8747, + "step": 3647000 + }, + { + "epoch": 1.133878486151761, + "grad_norm": 11.6000394821167, + "learning_rate": 3.1102025230803986e-05, + "loss": 2.8863, + "step": 3647500 + }, + { + "epoch": 1.1340339184322479, + "grad_norm": 15.987375259399414, + "learning_rate": 3.109943469279587e-05, + "loss": 2.8638, + "step": 3648000 + }, + { + "epoch": 1.1341893507127347, + "grad_norm": 8.677995681762695, + "learning_rate": 3.109684415478776e-05, + "loss": 2.9001, + "step": 3648500 + }, + { + "epoch": 1.1343447829932216, + "grad_norm": 12.059830665588379, + "learning_rate": 3.109425361677964e-05, + "loss": 2.8053, + "step": 3649000 + }, + { + "epoch": 1.1345002152737085, + "grad_norm": 8.477445602416992, + "learning_rate": 3.109166307877153e-05, + "loss": 2.8916, + "step": 3649500 + }, + { + "epoch": 1.1346556475541953, + "grad_norm": 7.871788501739502, + "learning_rate": 3.1089072540763415e-05, + "loss": 2.8865, + "step": 3650000 + }, + { + "epoch": 1.1348110798346822, + "grad_norm": 10.545938491821289, + "learning_rate": 3.1086482002755295e-05, + "loss": 2.7884, + "step": 3650500 + }, + { + "epoch": 1.134966512115169, + "grad_norm": 9.761231422424316, + "learning_rate": 3.108389146474718e-05, + "loss": 2.9058, + "step": 3651000 + }, + { + "epoch": 1.135121944395656, + "grad_norm": 8.67349624633789, + "learning_rate": 3.108130092673907e-05, + "loss": 2.8861, + "step": 3651500 + }, + { + "epoch": 1.1352773766761428, + "grad_norm": 27.784727096557617, + "learning_rate": 3.107871038873095e-05, + "loss": 2.8913, + "step": 3652000 + }, + { + "epoch": 1.1354328089566297, + "grad_norm": 8.429821014404297, + "learning_rate": 3.107611985072284e-05, + "loss": 2.9037, + "step": 3652500 + }, + { + "epoch": 1.1355882412371165, + "grad_norm": 8.91736125946045, + "learning_rate": 3.1073529312714724e-05, + "loss": 2.9067, + "step": 3653000 + }, + { + "epoch": 1.1357436735176034, + "grad_norm": 9.735246658325195, + "learning_rate": 3.107093877470661e-05, + "loss": 2.9222, + "step": 3653500 + }, + { + "epoch": 1.1358991057980903, + "grad_norm": 8.444623947143555, + "learning_rate": 3.10683482366985e-05, + "loss": 2.9238, + "step": 3654000 + }, + { + "epoch": 1.1360545380785771, + "grad_norm": 24.2415828704834, + "learning_rate": 3.106575769869038e-05, + "loss": 2.8697, + "step": 3654500 + }, + { + "epoch": 1.136209970359064, + "grad_norm": 10.131239891052246, + "learning_rate": 3.1063167160682266e-05, + "loss": 2.9168, + "step": 3655000 + }, + { + "epoch": 1.136365402639551, + "grad_norm": 8.741748809814453, + "learning_rate": 3.106057662267415e-05, + "loss": 2.8394, + "step": 3655500 + }, + { + "epoch": 1.136520834920038, + "grad_norm": 50.11614227294922, + "learning_rate": 3.1057986084666034e-05, + "loss": 2.877, + "step": 3656000 + }, + { + "epoch": 1.1366762672005248, + "grad_norm": 10.362800598144531, + "learning_rate": 3.105539554665792e-05, + "loss": 2.8759, + "step": 3656500 + }, + { + "epoch": 1.1368316994810117, + "grad_norm": 8.937915802001953, + "learning_rate": 3.105280500864981e-05, + "loss": 2.8615, + "step": 3657000 + }, + { + "epoch": 1.1369871317614986, + "grad_norm": 9.229535102844238, + "learning_rate": 3.1050214470641695e-05, + "loss": 2.9071, + "step": 3657500 + }, + { + "epoch": 1.1371425640419854, + "grad_norm": 8.355441093444824, + "learning_rate": 3.104762393263358e-05, + "loss": 2.9301, + "step": 3658000 + }, + { + "epoch": 1.1372979963224723, + "grad_norm": 6.51224422454834, + "learning_rate": 3.104503339462547e-05, + "loss": 2.8751, + "step": 3658500 + }, + { + "epoch": 1.1374534286029592, + "grad_norm": 11.69173812866211, + "learning_rate": 3.104244285661735e-05, + "loss": 2.9054, + "step": 3659000 + }, + { + "epoch": 1.137608860883446, + "grad_norm": 11.762825012207031, + "learning_rate": 3.103985231860924e-05, + "loss": 2.8837, + "step": 3659500 + }, + { + "epoch": 1.137764293163933, + "grad_norm": 11.183419227600098, + "learning_rate": 3.103726178060112e-05, + "loss": 2.8871, + "step": 3660000 + }, + { + "epoch": 1.1379197254444198, + "grad_norm": 17.23084259033203, + "learning_rate": 3.1034671242593004e-05, + "loss": 2.8687, + "step": 3660500 + }, + { + "epoch": 1.1380751577249066, + "grad_norm": 9.192919731140137, + "learning_rate": 3.103208070458489e-05, + "loss": 2.882, + "step": 3661000 + }, + { + "epoch": 1.1382305900053935, + "grad_norm": 8.830678939819336, + "learning_rate": 3.102949016657677e-05, + "loss": 2.8456, + "step": 3661500 + }, + { + "epoch": 1.1383860222858804, + "grad_norm": 8.438133239746094, + "learning_rate": 3.102689962856866e-05, + "loss": 2.9079, + "step": 3662000 + }, + { + "epoch": 1.1385414545663672, + "grad_norm": 9.686615943908691, + "learning_rate": 3.1024309090560546e-05, + "loss": 2.8802, + "step": 3662500 + }, + { + "epoch": 1.138696886846854, + "grad_norm": 68.03922271728516, + "learning_rate": 3.102171855255243e-05, + "loss": 2.8867, + "step": 3663000 + }, + { + "epoch": 1.138852319127341, + "grad_norm": 8.693840980529785, + "learning_rate": 3.101912801454432e-05, + "loss": 2.8923, + "step": 3663500 + }, + { + "epoch": 1.1390077514078278, + "grad_norm": 8.684671401977539, + "learning_rate": 3.101653747653621e-05, + "loss": 2.8569, + "step": 3664000 + }, + { + "epoch": 1.1391631836883147, + "grad_norm": 8.503917694091797, + "learning_rate": 3.101394693852809e-05, + "loss": 2.8771, + "step": 3664500 + }, + { + "epoch": 1.1393186159688016, + "grad_norm": 8.144817352294922, + "learning_rate": 3.1011356400519975e-05, + "loss": 2.9031, + "step": 3665000 + }, + { + "epoch": 1.1394740482492884, + "grad_norm": 9.24911880493164, + "learning_rate": 3.1008765862511856e-05, + "loss": 2.8867, + "step": 3665500 + }, + { + "epoch": 1.1396294805297753, + "grad_norm": 8.57437801361084, + "learning_rate": 3.100617532450374e-05, + "loss": 3.0117, + "step": 3666000 + }, + { + "epoch": 1.1397849128102622, + "grad_norm": 14.941328048706055, + "learning_rate": 3.100358478649563e-05, + "loss": 2.9135, + "step": 3666500 + }, + { + "epoch": 1.139940345090749, + "grad_norm": 14.345149040222168, + "learning_rate": 3.100099424848752e-05, + "loss": 2.9151, + "step": 3667000 + }, + { + "epoch": 1.140095777371236, + "grad_norm": 9.178128242492676, + "learning_rate": 3.0998403710479404e-05, + "loss": 2.9056, + "step": 3667500 + }, + { + "epoch": 1.1402512096517228, + "grad_norm": 9.270119667053223, + "learning_rate": 3.099581317247129e-05, + "loss": 2.9083, + "step": 3668000 + }, + { + "epoch": 1.1404066419322096, + "grad_norm": 11.34552001953125, + "learning_rate": 3.099322263446317e-05, + "loss": 2.9213, + "step": 3668500 + }, + { + "epoch": 1.1405620742126967, + "grad_norm": 8.259502410888672, + "learning_rate": 3.099063209645506e-05, + "loss": 2.892, + "step": 3669000 + }, + { + "epoch": 1.1407175064931836, + "grad_norm": 11.6276273727417, + "learning_rate": 3.0988041558446946e-05, + "loss": 2.8031, + "step": 3669500 + }, + { + "epoch": 1.1408729387736705, + "grad_norm": 9.460600852966309, + "learning_rate": 3.0985451020438826e-05, + "loss": 2.9354, + "step": 3670000 + }, + { + "epoch": 1.1410283710541573, + "grad_norm": 8.132425308227539, + "learning_rate": 3.0982860482430714e-05, + "loss": 2.9051, + "step": 3670500 + }, + { + "epoch": 1.1411838033346442, + "grad_norm": 8.006209373474121, + "learning_rate": 3.0980269944422594e-05, + "loss": 2.879, + "step": 3671000 + }, + { + "epoch": 1.141339235615131, + "grad_norm": 8.883816719055176, + "learning_rate": 3.097767940641448e-05, + "loss": 2.9087, + "step": 3671500 + }, + { + "epoch": 1.141494667895618, + "grad_norm": 10.318185806274414, + "learning_rate": 3.097508886840637e-05, + "loss": 2.8823, + "step": 3672000 + }, + { + "epoch": 1.1416501001761048, + "grad_norm": 10.875654220581055, + "learning_rate": 3.0972498330398255e-05, + "loss": 2.9426, + "step": 3672500 + }, + { + "epoch": 1.1418055324565917, + "grad_norm": 17.054460525512695, + "learning_rate": 3.096990779239014e-05, + "loss": 2.8826, + "step": 3673000 + }, + { + "epoch": 1.1419609647370785, + "grad_norm": 8.773521423339844, + "learning_rate": 3.096731725438203e-05, + "loss": 2.9154, + "step": 3673500 + }, + { + "epoch": 1.1421163970175654, + "grad_norm": 11.854867935180664, + "learning_rate": 3.096472671637391e-05, + "loss": 2.8642, + "step": 3674000 + }, + { + "epoch": 1.1422718292980523, + "grad_norm": 9.529500961303711, + "learning_rate": 3.09621361783658e-05, + "loss": 2.8341, + "step": 3674500 + }, + { + "epoch": 1.1424272615785391, + "grad_norm": 16.41471290588379, + "learning_rate": 3.0959545640357684e-05, + "loss": 2.8732, + "step": 3675000 + }, + { + "epoch": 1.142582693859026, + "grad_norm": 10.031516075134277, + "learning_rate": 3.0956955102349565e-05, + "loss": 2.9249, + "step": 3675500 + }, + { + "epoch": 1.1427381261395129, + "grad_norm": 9.563793182373047, + "learning_rate": 3.095436456434145e-05, + "loss": 2.8379, + "step": 3676000 + }, + { + "epoch": 1.1428935584199997, + "grad_norm": 26.403554916381836, + "learning_rate": 3.095177402633334e-05, + "loss": 2.9179, + "step": 3676500 + }, + { + "epoch": 1.1430489907004866, + "grad_norm": 7.987534999847412, + "learning_rate": 3.0949183488325226e-05, + "loss": 2.9149, + "step": 3677000 + }, + { + "epoch": 1.1432044229809735, + "grad_norm": 24.763641357421875, + "learning_rate": 3.0946592950317113e-05, + "loss": 2.8892, + "step": 3677500 + }, + { + "epoch": 1.1433598552614603, + "grad_norm": 8.582076072692871, + "learning_rate": 3.0944002412308994e-05, + "loss": 2.8531, + "step": 3678000 + }, + { + "epoch": 1.1435152875419472, + "grad_norm": 14.169878959655762, + "learning_rate": 3.094141187430088e-05, + "loss": 2.9044, + "step": 3678500 + }, + { + "epoch": 1.143670719822434, + "grad_norm": 12.271045684814453, + "learning_rate": 3.093882133629277e-05, + "loss": 2.8546, + "step": 3679000 + }, + { + "epoch": 1.1438261521029212, + "grad_norm": 9.250024795532227, + "learning_rate": 3.093623079828465e-05, + "loss": 2.8885, + "step": 3679500 + }, + { + "epoch": 1.143981584383408, + "grad_norm": 7.750658988952637, + "learning_rate": 3.0933640260276536e-05, + "loss": 2.8944, + "step": 3680000 + }, + { + "epoch": 1.144137016663895, + "grad_norm": 8.46078109741211, + "learning_rate": 3.093104972226842e-05, + "loss": 2.8799, + "step": 3680500 + }, + { + "epoch": 1.1442924489443818, + "grad_norm": 8.586504936218262, + "learning_rate": 3.09284591842603e-05, + "loss": 2.8649, + "step": 3681000 + }, + { + "epoch": 1.1444478812248686, + "grad_norm": 9.896004676818848, + "learning_rate": 3.092586864625219e-05, + "loss": 2.892, + "step": 3681500 + }, + { + "epoch": 1.1446033135053555, + "grad_norm": 8.861649513244629, + "learning_rate": 3.092327810824408e-05, + "loss": 2.891, + "step": 3682000 + }, + { + "epoch": 1.1447587457858424, + "grad_norm": 9.170988082885742, + "learning_rate": 3.0920687570235965e-05, + "loss": 2.9392, + "step": 3682500 + }, + { + "epoch": 1.1449141780663292, + "grad_norm": 9.633795738220215, + "learning_rate": 3.091809703222785e-05, + "loss": 2.9286, + "step": 3683000 + }, + { + "epoch": 1.145069610346816, + "grad_norm": 9.10450267791748, + "learning_rate": 3.091550649421973e-05, + "loss": 2.8609, + "step": 3683500 + }, + { + "epoch": 1.145225042627303, + "grad_norm": 8.880051612854004, + "learning_rate": 3.091291595621162e-05, + "loss": 2.877, + "step": 3684000 + }, + { + "epoch": 1.1453804749077898, + "grad_norm": 9.82390022277832, + "learning_rate": 3.0910325418203506e-05, + "loss": 2.8806, + "step": 3684500 + }, + { + "epoch": 1.1455359071882767, + "grad_norm": 12.356223106384277, + "learning_rate": 3.090773488019539e-05, + "loss": 2.9397, + "step": 3685000 + }, + { + "epoch": 1.1456913394687636, + "grad_norm": 10.805485725402832, + "learning_rate": 3.0905144342187274e-05, + "loss": 2.928, + "step": 3685500 + }, + { + "epoch": 1.1458467717492504, + "grad_norm": 9.73098373413086, + "learning_rate": 3.090255380417916e-05, + "loss": 2.9134, + "step": 3686000 + }, + { + "epoch": 1.1460022040297373, + "grad_norm": 9.097856521606445, + "learning_rate": 3.089996326617104e-05, + "loss": 2.9441, + "step": 3686500 + }, + { + "epoch": 1.1461576363102242, + "grad_norm": 9.25969409942627, + "learning_rate": 3.0897372728162935e-05, + "loss": 2.9323, + "step": 3687000 + }, + { + "epoch": 1.146313068590711, + "grad_norm": 10.307493209838867, + "learning_rate": 3.089478219015482e-05, + "loss": 2.905, + "step": 3687500 + }, + { + "epoch": 1.146468500871198, + "grad_norm": 9.19757080078125, + "learning_rate": 3.08921916521467e-05, + "loss": 2.8737, + "step": 3688000 + }, + { + "epoch": 1.1466239331516848, + "grad_norm": 15.388358116149902, + "learning_rate": 3.088960111413859e-05, + "loss": 2.8763, + "step": 3688500 + }, + { + "epoch": 1.1467793654321716, + "grad_norm": 9.901603698730469, + "learning_rate": 3.088701057613047e-05, + "loss": 2.8987, + "step": 3689000 + }, + { + "epoch": 1.1469347977126585, + "grad_norm": 9.47506046295166, + "learning_rate": 3.088442003812236e-05, + "loss": 2.8731, + "step": 3689500 + }, + { + "epoch": 1.1470902299931454, + "grad_norm": 16.833040237426758, + "learning_rate": 3.0881829500114245e-05, + "loss": 2.8941, + "step": 3690000 + }, + { + "epoch": 1.1472456622736322, + "grad_norm": 10.784590721130371, + "learning_rate": 3.0879238962106125e-05, + "loss": 2.9008, + "step": 3690500 + }, + { + "epoch": 1.147401094554119, + "grad_norm": 8.099569320678711, + "learning_rate": 3.087664842409801e-05, + "loss": 2.8959, + "step": 3691000 + }, + { + "epoch": 1.147556526834606, + "grad_norm": 9.589038848876953, + "learning_rate": 3.08740578860899e-05, + "loss": 2.8476, + "step": 3691500 + }, + { + "epoch": 1.1477119591150928, + "grad_norm": 8.339150428771973, + "learning_rate": 3.087146734808179e-05, + "loss": 2.8763, + "step": 3692000 + }, + { + "epoch": 1.1478673913955797, + "grad_norm": 10.958130836486816, + "learning_rate": 3.0868876810073674e-05, + "loss": 2.8845, + "step": 3692500 + }, + { + "epoch": 1.1480228236760668, + "grad_norm": 15.6001558303833, + "learning_rate": 3.086628627206556e-05, + "loss": 2.8807, + "step": 3693000 + }, + { + "epoch": 1.1481782559565537, + "grad_norm": 10.169745445251465, + "learning_rate": 3.086369573405744e-05, + "loss": 2.8448, + "step": 3693500 + }, + { + "epoch": 1.1483336882370405, + "grad_norm": 10.092588424682617, + "learning_rate": 3.086110519604933e-05, + "loss": 2.9059, + "step": 3694000 + }, + { + "epoch": 1.1484891205175274, + "grad_norm": 11.084443092346191, + "learning_rate": 3.0858514658041216e-05, + "loss": 2.8597, + "step": 3694500 + }, + { + "epoch": 1.1486445527980143, + "grad_norm": 13.082042694091797, + "learning_rate": 3.0855924120033096e-05, + "loss": 2.882, + "step": 3695000 + }, + { + "epoch": 1.1487999850785011, + "grad_norm": 11.753702163696289, + "learning_rate": 3.085333358202498e-05, + "loss": 2.8396, + "step": 3695500 + }, + { + "epoch": 1.148955417358988, + "grad_norm": 9.503456115722656, + "learning_rate": 3.0850743044016864e-05, + "loss": 2.8505, + "step": 3696000 + }, + { + "epoch": 1.1491108496394749, + "grad_norm": 8.573623657226562, + "learning_rate": 3.084815250600875e-05, + "loss": 2.8654, + "step": 3696500 + }, + { + "epoch": 1.1492662819199617, + "grad_norm": 9.78592300415039, + "learning_rate": 3.0845561968000645e-05, + "loss": 2.8509, + "step": 3697000 + }, + { + "epoch": 1.1494217142004486, + "grad_norm": 6.75507926940918, + "learning_rate": 3.0842971429992525e-05, + "loss": 2.8919, + "step": 3697500 + }, + { + "epoch": 1.1495771464809355, + "grad_norm": 8.313224792480469, + "learning_rate": 3.084038089198441e-05, + "loss": 2.8726, + "step": 3698000 + }, + { + "epoch": 1.1497325787614223, + "grad_norm": 9.471384048461914, + "learning_rate": 3.08377903539763e-05, + "loss": 2.8346, + "step": 3698500 + }, + { + "epoch": 1.1498880110419092, + "grad_norm": 7.3656206130981445, + "learning_rate": 3.083519981596818e-05, + "loss": 2.8708, + "step": 3699000 + }, + { + "epoch": 1.150043443322396, + "grad_norm": 12.794137954711914, + "learning_rate": 3.083260927796007e-05, + "loss": 2.847, + "step": 3699500 + }, + { + "epoch": 1.150198875602883, + "grad_norm": 19.250139236450195, + "learning_rate": 3.0830018739951954e-05, + "loss": 2.8907, + "step": 3700000 + }, + { + "epoch": 1.1503543078833698, + "grad_norm": 9.453936576843262, + "learning_rate": 3.0827428201943834e-05, + "loss": 2.871, + "step": 3700500 + }, + { + "epoch": 1.1505097401638567, + "grad_norm": 10.135859489440918, + "learning_rate": 3.082483766393572e-05, + "loss": 2.8558, + "step": 3701000 + }, + { + "epoch": 1.1506651724443435, + "grad_norm": 10.610391616821289, + "learning_rate": 3.082224712592761e-05, + "loss": 2.884, + "step": 3701500 + }, + { + "epoch": 1.1508206047248304, + "grad_norm": 9.43783187866211, + "learning_rate": 3.0819656587919496e-05, + "loss": 2.907, + "step": 3702000 + }, + { + "epoch": 1.1509760370053173, + "grad_norm": 8.662869453430176, + "learning_rate": 3.081706604991138e-05, + "loss": 2.9361, + "step": 3702500 + }, + { + "epoch": 1.1511314692858041, + "grad_norm": 8.064444541931152, + "learning_rate": 3.081447551190326e-05, + "loss": 2.8553, + "step": 3703000 + }, + { + "epoch": 1.1512869015662912, + "grad_norm": 8.506932258605957, + "learning_rate": 3.081188497389515e-05, + "loss": 2.8676, + "step": 3703500 + }, + { + "epoch": 1.151442333846778, + "grad_norm": 9.811027526855469, + "learning_rate": 3.080929443588704e-05, + "loss": 2.9419, + "step": 3704000 + }, + { + "epoch": 1.151597766127265, + "grad_norm": 10.66740894317627, + "learning_rate": 3.080670389787892e-05, + "loss": 2.8627, + "step": 3704500 + }, + { + "epoch": 1.1517531984077518, + "grad_norm": 8.454649925231934, + "learning_rate": 3.0804113359870805e-05, + "loss": 2.8633, + "step": 3705000 + }, + { + "epoch": 1.1519086306882387, + "grad_norm": 9.044727325439453, + "learning_rate": 3.080152282186269e-05, + "loss": 2.8785, + "step": 3705500 + }, + { + "epoch": 1.1520640629687255, + "grad_norm": 8.032092094421387, + "learning_rate": 3.079893228385457e-05, + "loss": 2.8311, + "step": 3706000 + }, + { + "epoch": 1.1522194952492124, + "grad_norm": 10.420334815979004, + "learning_rate": 3.079634174584646e-05, + "loss": 2.8371, + "step": 3706500 + }, + { + "epoch": 1.1523749275296993, + "grad_norm": 10.893536567687988, + "learning_rate": 3.079375120783835e-05, + "loss": 2.8552, + "step": 3707000 + }, + { + "epoch": 1.1525303598101861, + "grad_norm": 7.756669044494629, + "learning_rate": 3.0791160669830234e-05, + "loss": 2.872, + "step": 3707500 + }, + { + "epoch": 1.152685792090673, + "grad_norm": 7.579336166381836, + "learning_rate": 3.078857013182212e-05, + "loss": 2.8774, + "step": 3708000 + }, + { + "epoch": 1.1528412243711599, + "grad_norm": 9.196995735168457, + "learning_rate": 3.0785979593814e-05, + "loss": 2.8604, + "step": 3708500 + }, + { + "epoch": 1.1529966566516467, + "grad_norm": 32.974517822265625, + "learning_rate": 3.078338905580589e-05, + "loss": 2.8196, + "step": 3709000 + }, + { + "epoch": 1.1531520889321336, + "grad_norm": 12.20357894897461, + "learning_rate": 3.0780798517797776e-05, + "loss": 2.881, + "step": 3709500 + }, + { + "epoch": 1.1533075212126205, + "grad_norm": 10.147254943847656, + "learning_rate": 3.0778207979789656e-05, + "loss": 2.8914, + "step": 3710000 + }, + { + "epoch": 1.1534629534931073, + "grad_norm": 9.302989959716797, + "learning_rate": 3.0775617441781544e-05, + "loss": 2.9319, + "step": 3710500 + }, + { + "epoch": 1.1536183857735942, + "grad_norm": 8.280462265014648, + "learning_rate": 3.077302690377343e-05, + "loss": 2.8532, + "step": 3711000 + }, + { + "epoch": 1.153773818054081, + "grad_norm": 13.463823318481445, + "learning_rate": 3.077043636576532e-05, + "loss": 2.8781, + "step": 3711500 + }, + { + "epoch": 1.153929250334568, + "grad_norm": 9.025784492492676, + "learning_rate": 3.0767845827757205e-05, + "loss": 2.8379, + "step": 3712000 + }, + { + "epoch": 1.1540846826150548, + "grad_norm": 11.260075569152832, + "learning_rate": 3.076525528974909e-05, + "loss": 2.891, + "step": 3712500 + }, + { + "epoch": 1.1542401148955417, + "grad_norm": 9.97148323059082, + "learning_rate": 3.076266475174097e-05, + "loss": 2.8769, + "step": 3713000 + }, + { + "epoch": 1.1543955471760285, + "grad_norm": 8.028395652770996, + "learning_rate": 3.076007421373286e-05, + "loss": 2.8892, + "step": 3713500 + }, + { + "epoch": 1.1545509794565154, + "grad_norm": 10.22634506225586, + "learning_rate": 3.075748367572474e-05, + "loss": 2.8588, + "step": 3714000 + }, + { + "epoch": 1.1547064117370023, + "grad_norm": 11.604952812194824, + "learning_rate": 3.075489313771663e-05, + "loss": 2.8335, + "step": 3714500 + }, + { + "epoch": 1.1548618440174891, + "grad_norm": 10.595090866088867, + "learning_rate": 3.0752302599708514e-05, + "loss": 2.8618, + "step": 3715000 + }, + { + "epoch": 1.155017276297976, + "grad_norm": 18.184141159057617, + "learning_rate": 3.0749712061700395e-05, + "loss": 2.939, + "step": 3715500 + }, + { + "epoch": 1.1551727085784629, + "grad_norm": 7.356186866760254, + "learning_rate": 3.074712152369228e-05, + "loss": 2.9292, + "step": 3716000 + }, + { + "epoch": 1.1553281408589497, + "grad_norm": 9.97861385345459, + "learning_rate": 3.074453098568417e-05, + "loss": 2.8775, + "step": 3716500 + }, + { + "epoch": 1.1554835731394368, + "grad_norm": 9.210124015808105, + "learning_rate": 3.0741940447676056e-05, + "loss": 2.8774, + "step": 3717000 + }, + { + "epoch": 1.1556390054199237, + "grad_norm": 9.324450492858887, + "learning_rate": 3.073934990966794e-05, + "loss": 2.891, + "step": 3717500 + }, + { + "epoch": 1.1557944377004106, + "grad_norm": 8.048367500305176, + "learning_rate": 3.073675937165983e-05, + "loss": 2.9042, + "step": 3718000 + }, + { + "epoch": 1.1559498699808974, + "grad_norm": 11.68290901184082, + "learning_rate": 3.073416883365171e-05, + "loss": 2.8472, + "step": 3718500 + }, + { + "epoch": 1.1561053022613843, + "grad_norm": 9.346553802490234, + "learning_rate": 3.07315782956436e-05, + "loss": 2.8613, + "step": 3719000 + }, + { + "epoch": 1.1562607345418712, + "grad_norm": 8.56692886352539, + "learning_rate": 3.072898775763548e-05, + "loss": 2.839, + "step": 3719500 + }, + { + "epoch": 1.156416166822358, + "grad_norm": 7.698729038238525, + "learning_rate": 3.0726397219627366e-05, + "loss": 2.85, + "step": 3720000 + }, + { + "epoch": 1.156571599102845, + "grad_norm": 6.946708679199219, + "learning_rate": 3.072380668161925e-05, + "loss": 2.8811, + "step": 3720500 + }, + { + "epoch": 1.1567270313833318, + "grad_norm": 6.997478008270264, + "learning_rate": 3.072121614361114e-05, + "loss": 2.8655, + "step": 3721000 + }, + { + "epoch": 1.1568824636638186, + "grad_norm": 9.399158477783203, + "learning_rate": 3.071862560560303e-05, + "loss": 2.8282, + "step": 3721500 + }, + { + "epoch": 1.1570378959443055, + "grad_norm": 6.648499965667725, + "learning_rate": 3.0716035067594914e-05, + "loss": 2.8438, + "step": 3722000 + }, + { + "epoch": 1.1571933282247924, + "grad_norm": 6.416041851043701, + "learning_rate": 3.0713444529586795e-05, + "loss": 2.8974, + "step": 3722500 + }, + { + "epoch": 1.1573487605052792, + "grad_norm": 8.014937400817871, + "learning_rate": 3.071085399157868e-05, + "loss": 2.848, + "step": 3723000 + }, + { + "epoch": 1.157504192785766, + "grad_norm": 13.116036415100098, + "learning_rate": 3.070826345357057e-05, + "loss": 2.8765, + "step": 3723500 + }, + { + "epoch": 1.157659625066253, + "grad_norm": 11.359403610229492, + "learning_rate": 3.070567291556245e-05, + "loss": 2.8355, + "step": 3724000 + }, + { + "epoch": 1.1578150573467398, + "grad_norm": 8.800802230834961, + "learning_rate": 3.0703082377554336e-05, + "loss": 2.8936, + "step": 3724500 + }, + { + "epoch": 1.1579704896272267, + "grad_norm": 12.572084426879883, + "learning_rate": 3.0700491839546224e-05, + "loss": 2.8507, + "step": 3725000 + }, + { + "epoch": 1.1581259219077136, + "grad_norm": 8.922894477844238, + "learning_rate": 3.0697901301538104e-05, + "loss": 2.8929, + "step": 3725500 + }, + { + "epoch": 1.1582813541882004, + "grad_norm": 9.917959213256836, + "learning_rate": 3.069531076352999e-05, + "loss": 2.8326, + "step": 3726000 + }, + { + "epoch": 1.1584367864686873, + "grad_norm": 6.911139011383057, + "learning_rate": 3.069272022552188e-05, + "loss": 2.8794, + "step": 3726500 + }, + { + "epoch": 1.1585922187491742, + "grad_norm": 18.094667434692383, + "learning_rate": 3.0690129687513765e-05, + "loss": 2.8272, + "step": 3727000 + }, + { + "epoch": 1.1587476510296613, + "grad_norm": 8.584526062011719, + "learning_rate": 3.068753914950565e-05, + "loss": 2.8452, + "step": 3727500 + }, + { + "epoch": 1.1589030833101481, + "grad_norm": 15.146541595458984, + "learning_rate": 3.068494861149753e-05, + "loss": 2.899, + "step": 3728000 + }, + { + "epoch": 1.159058515590635, + "grad_norm": 9.23239517211914, + "learning_rate": 3.068235807348942e-05, + "loss": 2.926, + "step": 3728500 + }, + { + "epoch": 1.1592139478711219, + "grad_norm": 12.262267112731934, + "learning_rate": 3.067976753548131e-05, + "loss": 2.9075, + "step": 3729000 + }, + { + "epoch": 1.1593693801516087, + "grad_norm": 12.564126014709473, + "learning_rate": 3.067717699747319e-05, + "loss": 2.8841, + "step": 3729500 + }, + { + "epoch": 1.1595248124320956, + "grad_norm": 10.20461368560791, + "learning_rate": 3.0674586459465075e-05, + "loss": 2.9359, + "step": 3730000 + }, + { + "epoch": 1.1596802447125825, + "grad_norm": 24.44304084777832, + "learning_rate": 3.067199592145696e-05, + "loss": 2.8967, + "step": 3730500 + }, + { + "epoch": 1.1598356769930693, + "grad_norm": 11.553668022155762, + "learning_rate": 3.066940538344885e-05, + "loss": 2.8587, + "step": 3731000 + }, + { + "epoch": 1.1599911092735562, + "grad_norm": 10.376814842224121, + "learning_rate": 3.0666814845440736e-05, + "loss": 2.8111, + "step": 3731500 + }, + { + "epoch": 1.160146541554043, + "grad_norm": 10.647350311279297, + "learning_rate": 3.0664224307432617e-05, + "loss": 2.8354, + "step": 3732000 + }, + { + "epoch": 1.16030197383453, + "grad_norm": 9.740877151489258, + "learning_rate": 3.0661633769424504e-05, + "loss": 2.8302, + "step": 3732500 + }, + { + "epoch": 1.1604574061150168, + "grad_norm": 6.541997909545898, + "learning_rate": 3.065904323141639e-05, + "loss": 2.8534, + "step": 3733000 + }, + { + "epoch": 1.1606128383955037, + "grad_norm": 9.331857681274414, + "learning_rate": 3.065645269340827e-05, + "loss": 2.8314, + "step": 3733500 + }, + { + "epoch": 1.1607682706759905, + "grad_norm": 9.075052261352539, + "learning_rate": 3.065386215540016e-05, + "loss": 2.8957, + "step": 3734000 + }, + { + "epoch": 1.1609237029564774, + "grad_norm": 8.338165283203125, + "learning_rate": 3.0651271617392046e-05, + "loss": 2.9089, + "step": 3734500 + }, + { + "epoch": 1.1610791352369643, + "grad_norm": 8.670093536376953, + "learning_rate": 3.0648681079383926e-05, + "loss": 2.8582, + "step": 3735000 + }, + { + "epoch": 1.1612345675174511, + "grad_norm": 9.51729965209961, + "learning_rate": 3.064609054137581e-05, + "loss": 2.8504, + "step": 3735500 + }, + { + "epoch": 1.161389999797938, + "grad_norm": 10.069223403930664, + "learning_rate": 3.06435000033677e-05, + "loss": 2.8883, + "step": 3736000 + }, + { + "epoch": 1.1615454320784249, + "grad_norm": 9.334388732910156, + "learning_rate": 3.064090946535959e-05, + "loss": 2.862, + "step": 3736500 + }, + { + "epoch": 1.1617008643589117, + "grad_norm": 9.704655647277832, + "learning_rate": 3.0638318927351475e-05, + "loss": 2.8775, + "step": 3737000 + }, + { + "epoch": 1.1618562966393986, + "grad_norm": 11.659305572509766, + "learning_rate": 3.0635728389343355e-05, + "loss": 2.8927, + "step": 3737500 + }, + { + "epoch": 1.1620117289198855, + "grad_norm": 10.313007354736328, + "learning_rate": 3.063313785133524e-05, + "loss": 2.8821, + "step": 3738000 + }, + { + "epoch": 1.1621671612003723, + "grad_norm": 9.87134838104248, + "learning_rate": 3.063054731332713e-05, + "loss": 2.8669, + "step": 3738500 + }, + { + "epoch": 1.1623225934808592, + "grad_norm": 10.481751441955566, + "learning_rate": 3.062795677531901e-05, + "loss": 2.8783, + "step": 3739000 + }, + { + "epoch": 1.162478025761346, + "grad_norm": 16.28342628479004, + "learning_rate": 3.06253662373109e-05, + "loss": 2.8365, + "step": 3739500 + }, + { + "epoch": 1.162633458041833, + "grad_norm": 8.416786193847656, + "learning_rate": 3.0622775699302784e-05, + "loss": 2.9449, + "step": 3740000 + }, + { + "epoch": 1.1627888903223198, + "grad_norm": 8.801613807678223, + "learning_rate": 3.062018516129467e-05, + "loss": 2.8902, + "step": 3740500 + }, + { + "epoch": 1.162944322602807, + "grad_norm": 8.824114799499512, + "learning_rate": 3.061759462328656e-05, + "loss": 2.8945, + "step": 3741000 + }, + { + "epoch": 1.1630997548832938, + "grad_norm": 9.994221687316895, + "learning_rate": 3.0615004085278445e-05, + "loss": 2.8707, + "step": 3741500 + }, + { + "epoch": 1.1632551871637806, + "grad_norm": 81.36638641357422, + "learning_rate": 3.0612413547270326e-05, + "loss": 2.8353, + "step": 3742000 + }, + { + "epoch": 1.1634106194442675, + "grad_norm": 5.383805274963379, + "learning_rate": 3.060982300926221e-05, + "loss": 2.8517, + "step": 3742500 + }, + { + "epoch": 1.1635660517247544, + "grad_norm": 10.150308609008789, + "learning_rate": 3.06072324712541e-05, + "loss": 2.8511, + "step": 3743000 + }, + { + "epoch": 1.1637214840052412, + "grad_norm": 10.732014656066895, + "learning_rate": 3.060464193324598e-05, + "loss": 2.8303, + "step": 3743500 + }, + { + "epoch": 1.163876916285728, + "grad_norm": 6.28338623046875, + "learning_rate": 3.060205139523787e-05, + "loss": 2.8658, + "step": 3744000 + }, + { + "epoch": 1.164032348566215, + "grad_norm": 8.247649192810059, + "learning_rate": 3.059946085722975e-05, + "loss": 2.8903, + "step": 3744500 + }, + { + "epoch": 1.1641877808467018, + "grad_norm": 6.873173713684082, + "learning_rate": 3.0596870319221635e-05, + "loss": 2.8684, + "step": 3745000 + }, + { + "epoch": 1.1643432131271887, + "grad_norm": 14.570939064025879, + "learning_rate": 3.059427978121352e-05, + "loss": 2.8544, + "step": 3745500 + }, + { + "epoch": 1.1644986454076756, + "grad_norm": 12.460341453552246, + "learning_rate": 3.059168924320541e-05, + "loss": 2.8835, + "step": 3746000 + }, + { + "epoch": 1.1646540776881624, + "grad_norm": 8.627159118652344, + "learning_rate": 3.05890987051973e-05, + "loss": 2.8524, + "step": 3746500 + }, + { + "epoch": 1.1648095099686493, + "grad_norm": 10.359090805053711, + "learning_rate": 3.0586508167189184e-05, + "loss": 2.8398, + "step": 3747000 + }, + { + "epoch": 1.1649649422491362, + "grad_norm": 8.955473899841309, + "learning_rate": 3.0583917629181064e-05, + "loss": 2.8759, + "step": 3747500 + }, + { + "epoch": 1.165120374529623, + "grad_norm": 9.854158401489258, + "learning_rate": 3.058132709117295e-05, + "loss": 2.8542, + "step": 3748000 + }, + { + "epoch": 1.16527580681011, + "grad_norm": 10.1331787109375, + "learning_rate": 3.057873655316484e-05, + "loss": 2.8626, + "step": 3748500 + }, + { + "epoch": 1.1654312390905968, + "grad_norm": 54.98519515991211, + "learning_rate": 3.057614601515672e-05, + "loss": 2.8166, + "step": 3749000 + }, + { + "epoch": 1.1655866713710836, + "grad_norm": 11.065642356872559, + "learning_rate": 3.0573555477148606e-05, + "loss": 2.8548, + "step": 3749500 + }, + { + "epoch": 1.1657421036515705, + "grad_norm": 9.49869155883789, + "learning_rate": 3.0570964939140486e-05, + "loss": 2.8354, + "step": 3750000 + }, + { + "epoch": 1.1658975359320574, + "grad_norm": 18.543901443481445, + "learning_rate": 3.056837440113238e-05, + "loss": 2.845, + "step": 3750500 + }, + { + "epoch": 1.1660529682125442, + "grad_norm": 12.140480995178223, + "learning_rate": 3.056578386312427e-05, + "loss": 2.8444, + "step": 3751000 + }, + { + "epoch": 1.1662084004930313, + "grad_norm": 7.61896276473999, + "learning_rate": 3.056319332511615e-05, + "loss": 2.8856, + "step": 3751500 + }, + { + "epoch": 1.1663638327735182, + "grad_norm": 15.046345710754395, + "learning_rate": 3.0560602787108035e-05, + "loss": 2.8542, + "step": 3752000 + }, + { + "epoch": 1.166519265054005, + "grad_norm": 7.728577136993408, + "learning_rate": 3.055801224909992e-05, + "loss": 2.8471, + "step": 3752500 + }, + { + "epoch": 1.166674697334492, + "grad_norm": 9.332761764526367, + "learning_rate": 3.05554217110918e-05, + "loss": 2.8495, + "step": 3753000 + }, + { + "epoch": 1.1668301296149788, + "grad_norm": 8.007221221923828, + "learning_rate": 3.055283117308369e-05, + "loss": 2.8704, + "step": 3753500 + }, + { + "epoch": 1.1669855618954657, + "grad_norm": 10.804234504699707, + "learning_rate": 3.055024063507558e-05, + "loss": 2.904, + "step": 3754000 + }, + { + "epoch": 1.1671409941759525, + "grad_norm": 10.224113464355469, + "learning_rate": 3.054765009706746e-05, + "loss": 2.8591, + "step": 3754500 + }, + { + "epoch": 1.1672964264564394, + "grad_norm": 9.096250534057617, + "learning_rate": 3.0545059559059344e-05, + "loss": 2.8762, + "step": 3755000 + }, + { + "epoch": 1.1674518587369263, + "grad_norm": 10.065984725952148, + "learning_rate": 3.054246902105123e-05, + "loss": 2.8172, + "step": 3755500 + }, + { + "epoch": 1.1676072910174131, + "grad_norm": 8.64286994934082, + "learning_rate": 3.053987848304312e-05, + "loss": 2.8839, + "step": 3756000 + }, + { + "epoch": 1.1677627232979, + "grad_norm": 8.439372062683105, + "learning_rate": 3.0537287945035006e-05, + "loss": 2.8761, + "step": 3756500 + }, + { + "epoch": 1.1679181555783869, + "grad_norm": 10.63222885131836, + "learning_rate": 3.0534697407026886e-05, + "loss": 2.8521, + "step": 3757000 + }, + { + "epoch": 1.1680735878588737, + "grad_norm": 8.840168952941895, + "learning_rate": 3.053210686901877e-05, + "loss": 2.8708, + "step": 3757500 + }, + { + "epoch": 1.1682290201393606, + "grad_norm": 19.95561408996582, + "learning_rate": 3.052951633101066e-05, + "loss": 2.8476, + "step": 3758000 + }, + { + "epoch": 1.1683844524198475, + "grad_norm": 7.548054218292236, + "learning_rate": 3.052692579300254e-05, + "loss": 2.8313, + "step": 3758500 + }, + { + "epoch": 1.1685398847003343, + "grad_norm": 11.945965766906738, + "learning_rate": 3.052433525499443e-05, + "loss": 2.8398, + "step": 3759000 + }, + { + "epoch": 1.1686953169808212, + "grad_norm": 11.073527336120605, + "learning_rate": 3.0521744716986315e-05, + "loss": 2.8601, + "step": 3759500 + }, + { + "epoch": 1.168850749261308, + "grad_norm": 8.658404350280762, + "learning_rate": 3.0519154178978196e-05, + "loss": 2.829, + "step": 3760000 + }, + { + "epoch": 1.169006181541795, + "grad_norm": 10.702743530273438, + "learning_rate": 3.0516563640970086e-05, + "loss": 2.8569, + "step": 3760500 + }, + { + "epoch": 1.1691616138222818, + "grad_norm": 18.01536750793457, + "learning_rate": 3.0513973102961973e-05, + "loss": 2.8269, + "step": 3761000 + }, + { + "epoch": 1.1693170461027687, + "grad_norm": 12.649062156677246, + "learning_rate": 3.0511382564953857e-05, + "loss": 2.8524, + "step": 3761500 + }, + { + "epoch": 1.1694724783832555, + "grad_norm": 8.473977088928223, + "learning_rate": 3.0508792026945744e-05, + "loss": 2.8198, + "step": 3762000 + }, + { + "epoch": 1.1696279106637424, + "grad_norm": 15.59961223602295, + "learning_rate": 3.0506201488937625e-05, + "loss": 2.8209, + "step": 3762500 + }, + { + "epoch": 1.1697833429442293, + "grad_norm": 10.947473526000977, + "learning_rate": 3.050361095092951e-05, + "loss": 2.8048, + "step": 3763000 + }, + { + "epoch": 1.1699387752247161, + "grad_norm": 8.90805435180664, + "learning_rate": 3.05010204129214e-05, + "loss": 2.8068, + "step": 3763500 + }, + { + "epoch": 1.170094207505203, + "grad_norm": 8.481156349182129, + "learning_rate": 3.0498429874913283e-05, + "loss": 2.8548, + "step": 3764000 + }, + { + "epoch": 1.1702496397856899, + "grad_norm": 7.72632360458374, + "learning_rate": 3.049583933690517e-05, + "loss": 2.8361, + "step": 3764500 + }, + { + "epoch": 1.170405072066177, + "grad_norm": 9.134703636169434, + "learning_rate": 3.0493248798897057e-05, + "loss": 2.8264, + "step": 3765000 + }, + { + "epoch": 1.1705605043466638, + "grad_norm": 10.41663646697998, + "learning_rate": 3.0490658260888937e-05, + "loss": 2.8832, + "step": 3765500 + }, + { + "epoch": 1.1707159366271507, + "grad_norm": 7.969177722930908, + "learning_rate": 3.0488067722880824e-05, + "loss": 2.8259, + "step": 3766000 + }, + { + "epoch": 1.1708713689076375, + "grad_norm": 49.98832321166992, + "learning_rate": 3.048547718487271e-05, + "loss": 2.889, + "step": 3766500 + }, + { + "epoch": 1.1710268011881244, + "grad_norm": 15.525405883789062, + "learning_rate": 3.0482886646864595e-05, + "loss": 2.867, + "step": 3767000 + }, + { + "epoch": 1.1711822334686113, + "grad_norm": 13.462370872497559, + "learning_rate": 3.0480296108856483e-05, + "loss": 2.8552, + "step": 3767500 + }, + { + "epoch": 1.1713376657490981, + "grad_norm": 31.42702865600586, + "learning_rate": 3.0477705570848363e-05, + "loss": 2.8582, + "step": 3768000 + }, + { + "epoch": 1.171493098029585, + "grad_norm": 13.20004653930664, + "learning_rate": 3.047511503284025e-05, + "loss": 2.8579, + "step": 3768500 + }, + { + "epoch": 1.1716485303100719, + "grad_norm": 11.371710777282715, + "learning_rate": 3.0472524494832137e-05, + "loss": 2.8166, + "step": 3769000 + }, + { + "epoch": 1.1718039625905587, + "grad_norm": 10.054877281188965, + "learning_rate": 3.046993395682402e-05, + "loss": 2.8399, + "step": 3769500 + }, + { + "epoch": 1.1719593948710456, + "grad_norm": 11.53252124786377, + "learning_rate": 3.0467343418815908e-05, + "loss": 2.903, + "step": 3770000 + }, + { + "epoch": 1.1721148271515325, + "grad_norm": 23.256391525268555, + "learning_rate": 3.0464752880807795e-05, + "loss": 2.8849, + "step": 3770500 + }, + { + "epoch": 1.1722702594320193, + "grad_norm": 7.4883341789245605, + "learning_rate": 3.0462162342799676e-05, + "loss": 2.8221, + "step": 3771000 + }, + { + "epoch": 1.1724256917125062, + "grad_norm": 14.617647171020508, + "learning_rate": 3.0459571804791566e-05, + "loss": 2.864, + "step": 3771500 + }, + { + "epoch": 1.172581123992993, + "grad_norm": 10.128959655761719, + "learning_rate": 3.0456981266783453e-05, + "loss": 2.8561, + "step": 3772000 + }, + { + "epoch": 1.17273655627348, + "grad_norm": 9.354291915893555, + "learning_rate": 3.0454390728775334e-05, + "loss": 2.9297, + "step": 3772500 + }, + { + "epoch": 1.1728919885539668, + "grad_norm": 8.693360328674316, + "learning_rate": 3.045180019076722e-05, + "loss": 2.8735, + "step": 3773000 + }, + { + "epoch": 1.1730474208344537, + "grad_norm": 6.606147766113281, + "learning_rate": 3.0449209652759105e-05, + "loss": 2.8106, + "step": 3773500 + }, + { + "epoch": 1.1732028531149405, + "grad_norm": 9.057385444641113, + "learning_rate": 3.0446619114750992e-05, + "loss": 2.8109, + "step": 3774000 + }, + { + "epoch": 1.1733582853954274, + "grad_norm": 33.825218200683594, + "learning_rate": 3.044402857674288e-05, + "loss": 2.8271, + "step": 3774500 + }, + { + "epoch": 1.1735137176759143, + "grad_norm": 9.24666976928711, + "learning_rate": 3.044143803873476e-05, + "loss": 2.8929, + "step": 3775000 + }, + { + "epoch": 1.1736691499564014, + "grad_norm": 10.343999862670898, + "learning_rate": 3.0438847500726646e-05, + "loss": 2.8935, + "step": 3775500 + }, + { + "epoch": 1.1738245822368882, + "grad_norm": 9.033528327941895, + "learning_rate": 3.0436256962718534e-05, + "loss": 2.8497, + "step": 3776000 + }, + { + "epoch": 1.173980014517375, + "grad_norm": 8.542497634887695, + "learning_rate": 3.0433666424710417e-05, + "loss": 2.8313, + "step": 3776500 + }, + { + "epoch": 1.174135446797862, + "grad_norm": 7.515583038330078, + "learning_rate": 3.0431075886702305e-05, + "loss": 2.8607, + "step": 3777000 + }, + { + "epoch": 1.1742908790783488, + "grad_norm": 9.932984352111816, + "learning_rate": 3.0428485348694192e-05, + "loss": 2.8813, + "step": 3777500 + }, + { + "epoch": 1.1744463113588357, + "grad_norm": 8.018367767333984, + "learning_rate": 3.0425894810686072e-05, + "loss": 2.8199, + "step": 3778000 + }, + { + "epoch": 1.1746017436393226, + "grad_norm": 4.895200252532959, + "learning_rate": 3.042330427267796e-05, + "loss": 2.813, + "step": 3778500 + }, + { + "epoch": 1.1747571759198094, + "grad_norm": 8.611351013183594, + "learning_rate": 3.0420713734669846e-05, + "loss": 2.8418, + "step": 3779000 + }, + { + "epoch": 1.1749126082002963, + "grad_norm": 7.4723615646362305, + "learning_rate": 3.041812319666173e-05, + "loss": 2.8558, + "step": 3779500 + }, + { + "epoch": 1.1750680404807832, + "grad_norm": 8.181377410888672, + "learning_rate": 3.0415532658653617e-05, + "loss": 2.8627, + "step": 3780000 + }, + { + "epoch": 1.17522347276127, + "grad_norm": 9.568248748779297, + "learning_rate": 3.0412942120645498e-05, + "loss": 2.8719, + "step": 3780500 + }, + { + "epoch": 1.175378905041757, + "grad_norm": 7.6568217277526855, + "learning_rate": 3.0410351582637385e-05, + "loss": 2.8739, + "step": 3781000 + }, + { + "epoch": 1.1755343373222438, + "grad_norm": 9.834476470947266, + "learning_rate": 3.0407761044629275e-05, + "loss": 2.8565, + "step": 3781500 + }, + { + "epoch": 1.1756897696027306, + "grad_norm": 11.713651657104492, + "learning_rate": 3.0405170506621156e-05, + "loss": 2.8272, + "step": 3782000 + }, + { + "epoch": 1.1758452018832175, + "grad_norm": 16.269638061523438, + "learning_rate": 3.0402579968613043e-05, + "loss": 2.8713, + "step": 3782500 + }, + { + "epoch": 1.1760006341637044, + "grad_norm": 12.485589981079102, + "learning_rate": 3.039998943060493e-05, + "loss": 2.8953, + "step": 3783000 + }, + { + "epoch": 1.1761560664441912, + "grad_norm": 9.307219505310059, + "learning_rate": 3.0397398892596814e-05, + "loss": 2.8761, + "step": 3783500 + }, + { + "epoch": 1.176311498724678, + "grad_norm": 9.052794456481934, + "learning_rate": 3.03948083545887e-05, + "loss": 2.8532, + "step": 3784000 + }, + { + "epoch": 1.176466931005165, + "grad_norm": 8.510709762573242, + "learning_rate": 3.0392217816580588e-05, + "loss": 2.8481, + "step": 3784500 + }, + { + "epoch": 1.1766223632856518, + "grad_norm": 9.365324974060059, + "learning_rate": 3.038962727857247e-05, + "loss": 2.8901, + "step": 3785000 + }, + { + "epoch": 1.1767777955661387, + "grad_norm": 13.198243141174316, + "learning_rate": 3.0387036740564356e-05, + "loss": 2.823, + "step": 3785500 + }, + { + "epoch": 1.1769332278466256, + "grad_norm": 8.597790718078613, + "learning_rate": 3.038444620255624e-05, + "loss": 2.8665, + "step": 3786000 + }, + { + "epoch": 1.1770886601271124, + "grad_norm": 9.666033744812012, + "learning_rate": 3.0381855664548127e-05, + "loss": 2.8829, + "step": 3786500 + }, + { + "epoch": 1.1772440924075993, + "grad_norm": 11.551353454589844, + "learning_rate": 3.0379265126540014e-05, + "loss": 2.8388, + "step": 3787000 + }, + { + "epoch": 1.1773995246880862, + "grad_norm": 10.56826400756836, + "learning_rate": 3.0376674588531894e-05, + "loss": 2.8502, + "step": 3787500 + }, + { + "epoch": 1.177554956968573, + "grad_norm": 8.992363929748535, + "learning_rate": 3.037408405052378e-05, + "loss": 2.8086, + "step": 3788000 + }, + { + "epoch": 1.17771038924906, + "grad_norm": 9.45030689239502, + "learning_rate": 3.037149351251567e-05, + "loss": 2.8351, + "step": 3788500 + }, + { + "epoch": 1.1778658215295468, + "grad_norm": 10.015976905822754, + "learning_rate": 3.0368902974507552e-05, + "loss": 2.8533, + "step": 3789000 + }, + { + "epoch": 1.1780212538100339, + "grad_norm": 9.237509727478027, + "learning_rate": 3.036631243649944e-05, + "loss": 2.8643, + "step": 3789500 + }, + { + "epoch": 1.1781766860905207, + "grad_norm": 8.872989654541016, + "learning_rate": 3.0363721898491326e-05, + "loss": 2.851, + "step": 3790000 + }, + { + "epoch": 1.1783321183710076, + "grad_norm": 8.117878913879395, + "learning_rate": 3.0361131360483207e-05, + "loss": 2.822, + "step": 3790500 + }, + { + "epoch": 1.1784875506514945, + "grad_norm": 9.378774642944336, + "learning_rate": 3.0358540822475094e-05, + "loss": 2.8592, + "step": 3791000 + }, + { + "epoch": 1.1786429829319813, + "grad_norm": 7.794562339782715, + "learning_rate": 3.0355950284466978e-05, + "loss": 2.8789, + "step": 3791500 + }, + { + "epoch": 1.1787984152124682, + "grad_norm": 10.391420364379883, + "learning_rate": 3.0353359746458865e-05, + "loss": 2.8748, + "step": 3792000 + }, + { + "epoch": 1.178953847492955, + "grad_norm": 9.19128131866455, + "learning_rate": 3.0350769208450752e-05, + "loss": 2.8247, + "step": 3792500 + }, + { + "epoch": 1.179109279773442, + "grad_norm": 12.329371452331543, + "learning_rate": 3.0348178670442632e-05, + "loss": 2.8223, + "step": 3793000 + }, + { + "epoch": 1.1792647120539288, + "grad_norm": 9.783985137939453, + "learning_rate": 3.0345588132434523e-05, + "loss": 2.8555, + "step": 3793500 + }, + { + "epoch": 1.1794201443344157, + "grad_norm": 45.2945671081543, + "learning_rate": 3.034299759442641e-05, + "loss": 2.875, + "step": 3794000 + }, + { + "epoch": 1.1795755766149025, + "grad_norm": 8.913747787475586, + "learning_rate": 3.034040705641829e-05, + "loss": 2.8496, + "step": 3794500 + }, + { + "epoch": 1.1797310088953894, + "grad_norm": 14.370038032531738, + "learning_rate": 3.0337816518410178e-05, + "loss": 2.8662, + "step": 3795000 + }, + { + "epoch": 1.1798864411758763, + "grad_norm": 14.062255859375, + "learning_rate": 3.0335225980402065e-05, + "loss": 2.8385, + "step": 3795500 + }, + { + "epoch": 1.1800418734563631, + "grad_norm": 8.855382919311523, + "learning_rate": 3.033263544239395e-05, + "loss": 2.8653, + "step": 3796000 + }, + { + "epoch": 1.18019730573685, + "grad_norm": 11.994958877563477, + "learning_rate": 3.0330044904385836e-05, + "loss": 2.8203, + "step": 3796500 + }, + { + "epoch": 1.1803527380173369, + "grad_norm": 8.132678031921387, + "learning_rate": 3.0327454366377723e-05, + "loss": 2.8795, + "step": 3797000 + }, + { + "epoch": 1.1805081702978237, + "grad_norm": 10.530145645141602, + "learning_rate": 3.0324863828369603e-05, + "loss": 2.8392, + "step": 3797500 + }, + { + "epoch": 1.1806636025783106, + "grad_norm": 7.146368503570557, + "learning_rate": 3.032227329036149e-05, + "loss": 2.8032, + "step": 3798000 + }, + { + "epoch": 1.1808190348587975, + "grad_norm": 7.0665483474731445, + "learning_rate": 3.0319682752353374e-05, + "loss": 2.7971, + "step": 3798500 + }, + { + "epoch": 1.1809744671392843, + "grad_norm": 8.19948959350586, + "learning_rate": 3.031709221434526e-05, + "loss": 2.863, + "step": 3799000 + }, + { + "epoch": 1.1811298994197712, + "grad_norm": 13.80401611328125, + "learning_rate": 3.031450167633715e-05, + "loss": 2.8235, + "step": 3799500 + }, + { + "epoch": 1.1812853317002583, + "grad_norm": 13.169880867004395, + "learning_rate": 3.031191113832903e-05, + "loss": 2.8575, + "step": 3800000 + }, + { + "epoch": 1.1814407639807452, + "grad_norm": 9.651637077331543, + "learning_rate": 3.0309320600320916e-05, + "loss": 2.8864, + "step": 3800500 + }, + { + "epoch": 1.181596196261232, + "grad_norm": 7.772079944610596, + "learning_rate": 3.0306730062312803e-05, + "loss": 2.8301, + "step": 3801000 + }, + { + "epoch": 1.181751628541719, + "grad_norm": 8.025751113891602, + "learning_rate": 3.0304139524304687e-05, + "loss": 2.8331, + "step": 3801500 + }, + { + "epoch": 1.1819070608222058, + "grad_norm": 7.275542736053467, + "learning_rate": 3.0301548986296574e-05, + "loss": 2.8295, + "step": 3802000 + }, + { + "epoch": 1.1820624931026926, + "grad_norm": 9.18468189239502, + "learning_rate": 3.029895844828846e-05, + "loss": 2.8462, + "step": 3802500 + }, + { + "epoch": 1.1822179253831795, + "grad_norm": 11.89565658569336, + "learning_rate": 3.029636791028034e-05, + "loss": 2.8313, + "step": 3803000 + }, + { + "epoch": 1.1823733576636664, + "grad_norm": 13.685291290283203, + "learning_rate": 3.0293777372272232e-05, + "loss": 2.8792, + "step": 3803500 + }, + { + "epoch": 1.1825287899441532, + "grad_norm": 7.9654741287231445, + "learning_rate": 3.0291186834264113e-05, + "loss": 2.8649, + "step": 3804000 + }, + { + "epoch": 1.18268422222464, + "grad_norm": 7.9330735206604, + "learning_rate": 3.0288596296256e-05, + "loss": 2.8886, + "step": 3804500 + }, + { + "epoch": 1.182839654505127, + "grad_norm": 10.825921058654785, + "learning_rate": 3.0286005758247887e-05, + "loss": 2.8217, + "step": 3805000 + }, + { + "epoch": 1.1829950867856138, + "grad_norm": 8.817927360534668, + "learning_rate": 3.028341522023977e-05, + "loss": 2.8155, + "step": 3805500 + }, + { + "epoch": 1.1831505190661007, + "grad_norm": 7.126120567321777, + "learning_rate": 3.0280824682231658e-05, + "loss": 2.8643, + "step": 3806000 + }, + { + "epoch": 1.1833059513465876, + "grad_norm": 13.0680513381958, + "learning_rate": 3.0278234144223545e-05, + "loss": 2.8342, + "step": 3806500 + }, + { + "epoch": 1.1834613836270744, + "grad_norm": 10.190178871154785, + "learning_rate": 3.0275643606215425e-05, + "loss": 2.8481, + "step": 3807000 + }, + { + "epoch": 1.1836168159075613, + "grad_norm": 12.925643920898438, + "learning_rate": 3.0273053068207312e-05, + "loss": 2.848, + "step": 3807500 + }, + { + "epoch": 1.1837722481880482, + "grad_norm": 7.778632640838623, + "learning_rate": 3.02704625301992e-05, + "loss": 2.8563, + "step": 3808000 + }, + { + "epoch": 1.183927680468535, + "grad_norm": 9.191658973693848, + "learning_rate": 3.0267871992191083e-05, + "loss": 2.8385, + "step": 3808500 + }, + { + "epoch": 1.184083112749022, + "grad_norm": 8.158698081970215, + "learning_rate": 3.026528145418297e-05, + "loss": 2.8534, + "step": 3809000 + }, + { + "epoch": 1.1842385450295088, + "grad_norm": 8.447013854980469, + "learning_rate": 3.026269091617485e-05, + "loss": 2.8687, + "step": 3809500 + }, + { + "epoch": 1.1843939773099956, + "grad_norm": 7.746267795562744, + "learning_rate": 3.0260100378166738e-05, + "loss": 2.8516, + "step": 3810000 + }, + { + "epoch": 1.1845494095904825, + "grad_norm": 10.166268348693848, + "learning_rate": 3.0257509840158625e-05, + "loss": 2.8092, + "step": 3810500 + }, + { + "epoch": 1.1847048418709694, + "grad_norm": 9.342206954956055, + "learning_rate": 3.025491930215051e-05, + "loss": 2.8036, + "step": 3811000 + }, + { + "epoch": 1.1848602741514562, + "grad_norm": 20.274639129638672, + "learning_rate": 3.0252328764142396e-05, + "loss": 2.8004, + "step": 3811500 + }, + { + "epoch": 1.185015706431943, + "grad_norm": 9.502274513244629, + "learning_rate": 3.0249738226134283e-05, + "loss": 2.8651, + "step": 3812000 + }, + { + "epoch": 1.18517113871243, + "grad_norm": 15.464046478271484, + "learning_rate": 3.0247147688126164e-05, + "loss": 2.8642, + "step": 3812500 + }, + { + "epoch": 1.1853265709929168, + "grad_norm": 11.667840957641602, + "learning_rate": 3.024455715011805e-05, + "loss": 2.8442, + "step": 3813000 + }, + { + "epoch": 1.185482003273404, + "grad_norm": 11.776911735534668, + "learning_rate": 3.024196661210994e-05, + "loss": 2.8453, + "step": 3813500 + }, + { + "epoch": 1.1856374355538908, + "grad_norm": 9.219079971313477, + "learning_rate": 3.0239376074101822e-05, + "loss": 2.8269, + "step": 3814000 + }, + { + "epoch": 1.1857928678343777, + "grad_norm": 6.077486515045166, + "learning_rate": 3.023678553609371e-05, + "loss": 2.8242, + "step": 3814500 + }, + { + "epoch": 1.1859483001148645, + "grad_norm": 8.30745792388916, + "learning_rate": 3.0234194998085596e-05, + "loss": 2.8666, + "step": 3815000 + }, + { + "epoch": 1.1861037323953514, + "grad_norm": 8.365728378295898, + "learning_rate": 3.023160446007748e-05, + "loss": 2.8141, + "step": 3815500 + }, + { + "epoch": 1.1862591646758383, + "grad_norm": 17.4384765625, + "learning_rate": 3.0229013922069367e-05, + "loss": 2.9063, + "step": 3816000 + }, + { + "epoch": 1.1864145969563251, + "grad_norm": 7.723490238189697, + "learning_rate": 3.0226423384061247e-05, + "loss": 2.8247, + "step": 3816500 + }, + { + "epoch": 1.186570029236812, + "grad_norm": 9.97923755645752, + "learning_rate": 3.0223832846053135e-05, + "loss": 2.8589, + "step": 3817000 + }, + { + "epoch": 1.1867254615172989, + "grad_norm": 8.808914184570312, + "learning_rate": 3.022124230804502e-05, + "loss": 2.8815, + "step": 3817500 + }, + { + "epoch": 1.1868808937977857, + "grad_norm": 8.958978652954102, + "learning_rate": 3.0218651770036905e-05, + "loss": 2.8936, + "step": 3818000 + }, + { + "epoch": 1.1870363260782726, + "grad_norm": 8.664542198181152, + "learning_rate": 3.0216061232028793e-05, + "loss": 2.8208, + "step": 3818500 + }, + { + "epoch": 1.1871917583587595, + "grad_norm": 9.350676536560059, + "learning_rate": 3.021347069402068e-05, + "loss": 2.8752, + "step": 3819000 + }, + { + "epoch": 1.1873471906392463, + "grad_norm": 8.487968444824219, + "learning_rate": 3.021088015601256e-05, + "loss": 2.8665, + "step": 3819500 + }, + { + "epoch": 1.1875026229197332, + "grad_norm": 18.242496490478516, + "learning_rate": 3.0208289618004447e-05, + "loss": 2.8228, + "step": 3820000 + }, + { + "epoch": 1.18765805520022, + "grad_norm": 9.336053848266602, + "learning_rate": 3.0205699079996334e-05, + "loss": 2.8532, + "step": 3820500 + }, + { + "epoch": 1.187813487480707, + "grad_norm": 8.29316520690918, + "learning_rate": 3.0203108541988218e-05, + "loss": 2.8534, + "step": 3821000 + }, + { + "epoch": 1.1879689197611938, + "grad_norm": 36.068660736083984, + "learning_rate": 3.0200518003980105e-05, + "loss": 2.8385, + "step": 3821500 + }, + { + "epoch": 1.1881243520416807, + "grad_norm": 11.558265686035156, + "learning_rate": 3.0197927465971986e-05, + "loss": 2.8316, + "step": 3822000 + }, + { + "epoch": 1.1882797843221675, + "grad_norm": 8.886090278625488, + "learning_rate": 3.0195336927963873e-05, + "loss": 2.8848, + "step": 3822500 + }, + { + "epoch": 1.1884352166026544, + "grad_norm": 9.027069091796875, + "learning_rate": 3.019274638995576e-05, + "loss": 2.8343, + "step": 3823000 + }, + { + "epoch": 1.1885906488831413, + "grad_norm": 9.602248191833496, + "learning_rate": 3.0190155851947644e-05, + "loss": 2.8146, + "step": 3823500 + }, + { + "epoch": 1.1887460811636283, + "grad_norm": 10.498003005981445, + "learning_rate": 3.018756531393953e-05, + "loss": 2.8631, + "step": 3824000 + }, + { + "epoch": 1.1889015134441152, + "grad_norm": 10.560267448425293, + "learning_rate": 3.0184974775931418e-05, + "loss": 2.8247, + "step": 3824500 + }, + { + "epoch": 1.189056945724602, + "grad_norm": 8.019536972045898, + "learning_rate": 3.01823842379233e-05, + "loss": 2.8142, + "step": 3825000 + }, + { + "epoch": 1.189212378005089, + "grad_norm": 26.51106834411621, + "learning_rate": 3.017979369991519e-05, + "loss": 2.8419, + "step": 3825500 + }, + { + "epoch": 1.1893678102855758, + "grad_norm": 9.506032943725586, + "learning_rate": 3.0177203161907076e-05, + "loss": 2.8376, + "step": 3826000 + }, + { + "epoch": 1.1895232425660627, + "grad_norm": 8.814345359802246, + "learning_rate": 3.0174612623898957e-05, + "loss": 2.8914, + "step": 3826500 + }, + { + "epoch": 1.1896786748465495, + "grad_norm": 9.50777530670166, + "learning_rate": 3.0172022085890844e-05, + "loss": 2.8015, + "step": 3827000 + }, + { + "epoch": 1.1898341071270364, + "grad_norm": 9.716754913330078, + "learning_rate": 3.0169431547882727e-05, + "loss": 2.8833, + "step": 3827500 + }, + { + "epoch": 1.1899895394075233, + "grad_norm": 7.359256267547607, + "learning_rate": 3.0166841009874615e-05, + "loss": 2.8262, + "step": 3828000 + }, + { + "epoch": 1.1901449716880101, + "grad_norm": 15.809903144836426, + "learning_rate": 3.0164250471866502e-05, + "loss": 2.8211, + "step": 3828500 + }, + { + "epoch": 1.190300403968497, + "grad_norm": 9.89726734161377, + "learning_rate": 3.0161659933858382e-05, + "loss": 2.8694, + "step": 3829000 + }, + { + "epoch": 1.1904558362489839, + "grad_norm": 9.920437812805176, + "learning_rate": 3.015906939585027e-05, + "loss": 2.8273, + "step": 3829500 + }, + { + "epoch": 1.1906112685294707, + "grad_norm": 9.33663272857666, + "learning_rate": 3.0156478857842156e-05, + "loss": 2.8614, + "step": 3830000 + }, + { + "epoch": 1.1907667008099576, + "grad_norm": 9.605875015258789, + "learning_rate": 3.015388831983404e-05, + "loss": 2.8406, + "step": 3830500 + }, + { + "epoch": 1.1909221330904445, + "grad_norm": 7.570755481719971, + "learning_rate": 3.0151297781825927e-05, + "loss": 2.8331, + "step": 3831000 + }, + { + "epoch": 1.1910775653709313, + "grad_norm": 9.797012329101562, + "learning_rate": 3.0148707243817815e-05, + "loss": 2.8132, + "step": 3831500 + }, + { + "epoch": 1.1912329976514182, + "grad_norm": 12.881915092468262, + "learning_rate": 3.0146116705809695e-05, + "loss": 2.8034, + "step": 3832000 + }, + { + "epoch": 1.191388429931905, + "grad_norm": 8.097460746765137, + "learning_rate": 3.0143526167801582e-05, + "loss": 2.822, + "step": 3832500 + }, + { + "epoch": 1.191543862212392, + "grad_norm": 8.907685279846191, + "learning_rate": 3.014093562979347e-05, + "loss": 2.8626, + "step": 3833000 + }, + { + "epoch": 1.1916992944928788, + "grad_norm": 8.073929786682129, + "learning_rate": 3.0138345091785353e-05, + "loss": 2.8367, + "step": 3833500 + }, + { + "epoch": 1.1918547267733657, + "grad_norm": 7.702851295471191, + "learning_rate": 3.013575455377724e-05, + "loss": 2.8423, + "step": 3834000 + }, + { + "epoch": 1.1920101590538525, + "grad_norm": 8.910978317260742, + "learning_rate": 3.013316401576912e-05, + "loss": 2.8355, + "step": 3834500 + }, + { + "epoch": 1.1921655913343394, + "grad_norm": 10.560627937316895, + "learning_rate": 3.013057347776101e-05, + "loss": 2.8633, + "step": 3835000 + }, + { + "epoch": 1.1923210236148263, + "grad_norm": 13.105672836303711, + "learning_rate": 3.0127982939752898e-05, + "loss": 2.8135, + "step": 3835500 + }, + { + "epoch": 1.1924764558953131, + "grad_norm": 15.87173843383789, + "learning_rate": 3.012539240174478e-05, + "loss": 2.8979, + "step": 3836000 + }, + { + "epoch": 1.1926318881758, + "grad_norm": 15.442078590393066, + "learning_rate": 3.0122801863736666e-05, + "loss": 2.8362, + "step": 3836500 + }, + { + "epoch": 1.1927873204562869, + "grad_norm": 10.082051277160645, + "learning_rate": 3.0120211325728553e-05, + "loss": 2.8257, + "step": 3837000 + }, + { + "epoch": 1.192942752736774, + "grad_norm": 9.259586334228516, + "learning_rate": 3.0117620787720437e-05, + "loss": 2.8417, + "step": 3837500 + }, + { + "epoch": 1.1930981850172608, + "grad_norm": 12.738382339477539, + "learning_rate": 3.0115030249712324e-05, + "loss": 2.8462, + "step": 3838000 + }, + { + "epoch": 1.1932536172977477, + "grad_norm": 21.480806350708008, + "learning_rate": 3.011243971170421e-05, + "loss": 2.8673, + "step": 3838500 + }, + { + "epoch": 1.1934090495782346, + "grad_norm": 17.769563674926758, + "learning_rate": 3.010984917369609e-05, + "loss": 2.8452, + "step": 3839000 + }, + { + "epoch": 1.1935644818587214, + "grad_norm": 7.622185707092285, + "learning_rate": 3.010725863568798e-05, + "loss": 2.8194, + "step": 3839500 + }, + { + "epoch": 1.1937199141392083, + "grad_norm": 7.824098587036133, + "learning_rate": 3.0104668097679862e-05, + "loss": 2.8856, + "step": 3840000 + }, + { + "epoch": 1.1938753464196952, + "grad_norm": 8.586484909057617, + "learning_rate": 3.010207755967175e-05, + "loss": 2.8621, + "step": 3840500 + }, + { + "epoch": 1.194030778700182, + "grad_norm": 20.534748077392578, + "learning_rate": 3.0099487021663637e-05, + "loss": 2.8803, + "step": 3841000 + }, + { + "epoch": 1.194186210980669, + "grad_norm": 10.054141998291016, + "learning_rate": 3.0096896483655517e-05, + "loss": 2.8222, + "step": 3841500 + }, + { + "epoch": 1.1943416432611558, + "grad_norm": 8.35361385345459, + "learning_rate": 3.0094305945647404e-05, + "loss": 2.8386, + "step": 3842000 + }, + { + "epoch": 1.1944970755416426, + "grad_norm": 10.025413513183594, + "learning_rate": 3.009171540763929e-05, + "loss": 2.82, + "step": 3842500 + }, + { + "epoch": 1.1946525078221295, + "grad_norm": 6.756059646606445, + "learning_rate": 3.0089124869631175e-05, + "loss": 2.873, + "step": 3843000 + }, + { + "epoch": 1.1948079401026164, + "grad_norm": 11.002791404724121, + "learning_rate": 3.0086534331623062e-05, + "loss": 2.8079, + "step": 3843500 + }, + { + "epoch": 1.1949633723831032, + "grad_norm": 44.74539566040039, + "learning_rate": 3.008394379361495e-05, + "loss": 2.8429, + "step": 3844000 + }, + { + "epoch": 1.19511880466359, + "grad_norm": 10.236919403076172, + "learning_rate": 3.008135325560683e-05, + "loss": 2.838, + "step": 3844500 + }, + { + "epoch": 1.195274236944077, + "grad_norm": 12.763550758361816, + "learning_rate": 3.007876271759872e-05, + "loss": 2.8442, + "step": 3845000 + }, + { + "epoch": 1.1954296692245638, + "grad_norm": 9.023157119750977, + "learning_rate": 3.00761721795906e-05, + "loss": 2.8392, + "step": 3845500 + }, + { + "epoch": 1.1955851015050507, + "grad_norm": 8.482497215270996, + "learning_rate": 3.0073581641582488e-05, + "loss": 2.9142, + "step": 3846000 + }, + { + "epoch": 1.1957405337855376, + "grad_norm": 11.632247924804688, + "learning_rate": 3.0070991103574375e-05, + "loss": 2.7889, + "step": 3846500 + }, + { + "epoch": 1.1958959660660244, + "grad_norm": 9.132370948791504, + "learning_rate": 3.006840056556626e-05, + "loss": 2.8236, + "step": 3847000 + }, + { + "epoch": 1.1960513983465113, + "grad_norm": 12.238286018371582, + "learning_rate": 3.0065810027558146e-05, + "loss": 2.8427, + "step": 3847500 + }, + { + "epoch": 1.1962068306269984, + "grad_norm": 8.037822723388672, + "learning_rate": 3.0063219489550033e-05, + "loss": 2.8364, + "step": 3848000 + }, + { + "epoch": 1.1963622629074853, + "grad_norm": 8.234611511230469, + "learning_rate": 3.0060628951541913e-05, + "loss": 2.848, + "step": 3848500 + }, + { + "epoch": 1.1965176951879721, + "grad_norm": 9.551069259643555, + "learning_rate": 3.00580384135338e-05, + "loss": 2.8381, + "step": 3849000 + }, + { + "epoch": 1.196673127468459, + "grad_norm": 8.7236909866333, + "learning_rate": 3.0055447875525688e-05, + "loss": 2.876, + "step": 3849500 + }, + { + "epoch": 1.1968285597489459, + "grad_norm": 9.549074172973633, + "learning_rate": 3.005285733751757e-05, + "loss": 2.8745, + "step": 3850000 + }, + { + "epoch": 1.1969839920294327, + "grad_norm": 9.60888385772705, + "learning_rate": 3.005026679950946e-05, + "loss": 2.8266, + "step": 3850500 + }, + { + "epoch": 1.1971394243099196, + "grad_norm": 9.992817878723145, + "learning_rate": 3.0047676261501346e-05, + "loss": 2.7948, + "step": 3851000 + }, + { + "epoch": 1.1972948565904065, + "grad_norm": 11.295676231384277, + "learning_rate": 3.0045085723493226e-05, + "loss": 2.8688, + "step": 3851500 + }, + { + "epoch": 1.1974502888708933, + "grad_norm": 10.700592994689941, + "learning_rate": 3.0042495185485113e-05, + "loss": 2.8382, + "step": 3852000 + }, + { + "epoch": 1.1976057211513802, + "grad_norm": 8.405521392822266, + "learning_rate": 3.0039904647476997e-05, + "loss": 2.8257, + "step": 3852500 + }, + { + "epoch": 1.197761153431867, + "grad_norm": 15.658171653747559, + "learning_rate": 3.0037314109468884e-05, + "loss": 2.8995, + "step": 3853000 + }, + { + "epoch": 1.197916585712354, + "grad_norm": 12.539555549621582, + "learning_rate": 3.003472357146077e-05, + "loss": 2.8049, + "step": 3853500 + }, + { + "epoch": 1.1980720179928408, + "grad_norm": 16.83293342590332, + "learning_rate": 3.0032133033452652e-05, + "loss": 2.8998, + "step": 3854000 + }, + { + "epoch": 1.1982274502733277, + "grad_norm": 9.656598091125488, + "learning_rate": 3.002954249544454e-05, + "loss": 2.8374, + "step": 3854500 + }, + { + "epoch": 1.1983828825538145, + "grad_norm": 11.894771575927734, + "learning_rate": 3.002695195743643e-05, + "loss": 2.8449, + "step": 3855000 + }, + { + "epoch": 1.1985383148343014, + "grad_norm": 9.639336585998535, + "learning_rate": 3.002436141942831e-05, + "loss": 2.8549, + "step": 3855500 + }, + { + "epoch": 1.1986937471147883, + "grad_norm": 5.316936016082764, + "learning_rate": 3.0021770881420197e-05, + "loss": 2.8659, + "step": 3856000 + }, + { + "epoch": 1.1988491793952751, + "grad_norm": 10.374924659729004, + "learning_rate": 3.0019180343412084e-05, + "loss": 2.8763, + "step": 3856500 + }, + { + "epoch": 1.199004611675762, + "grad_norm": 8.158703804016113, + "learning_rate": 3.0016589805403968e-05, + "loss": 2.8495, + "step": 3857000 + }, + { + "epoch": 1.1991600439562489, + "grad_norm": 8.491128921508789, + "learning_rate": 3.0013999267395855e-05, + "loss": 2.8715, + "step": 3857500 + }, + { + "epoch": 1.1993154762367357, + "grad_norm": 8.316604614257812, + "learning_rate": 3.0011408729387735e-05, + "loss": 2.8527, + "step": 3858000 + }, + { + "epoch": 1.1994709085172226, + "grad_norm": 8.681838035583496, + "learning_rate": 3.0008818191379623e-05, + "loss": 2.8551, + "step": 3858500 + }, + { + "epoch": 1.1996263407977095, + "grad_norm": 7.734654426574707, + "learning_rate": 3.000622765337151e-05, + "loss": 2.8018, + "step": 3859000 + }, + { + "epoch": 1.1997817730781963, + "grad_norm": 6.8279547691345215, + "learning_rate": 3.0003637115363393e-05, + "loss": 2.8281, + "step": 3859500 + }, + { + "epoch": 1.1999372053586832, + "grad_norm": 10.61911392211914, + "learning_rate": 3.000104657735528e-05, + "loss": 2.8281, + "step": 3860000 + }, + { + "epoch": 1.20009263763917, + "grad_norm": 9.593121528625488, + "learning_rate": 2.9998456039347168e-05, + "loss": 2.8236, + "step": 3860500 + }, + { + "epoch": 1.200248069919657, + "grad_norm": 7.54515266418457, + "learning_rate": 2.9995865501339048e-05, + "loss": 2.7968, + "step": 3861000 + }, + { + "epoch": 1.200403502200144, + "grad_norm": 10.734244346618652, + "learning_rate": 2.9993274963330935e-05, + "loss": 2.8275, + "step": 3861500 + }, + { + "epoch": 1.200558934480631, + "grad_norm": 7.357810974121094, + "learning_rate": 2.9990684425322822e-05, + "loss": 2.8858, + "step": 3862000 + }, + { + "epoch": 1.2007143667611178, + "grad_norm": 11.343700408935547, + "learning_rate": 2.9988093887314706e-05, + "loss": 2.8207, + "step": 3862500 + }, + { + "epoch": 1.2008697990416046, + "grad_norm": 11.52673053741455, + "learning_rate": 2.9985503349306593e-05, + "loss": 2.8197, + "step": 3863000 + }, + { + "epoch": 1.2010252313220915, + "grad_norm": 9.586836814880371, + "learning_rate": 2.998291281129848e-05, + "loss": 2.8595, + "step": 3863500 + }, + { + "epoch": 1.2011806636025784, + "grad_norm": 9.336026191711426, + "learning_rate": 2.998032227329036e-05, + "loss": 2.856, + "step": 3864000 + }, + { + "epoch": 1.2013360958830652, + "grad_norm": 8.104730606079102, + "learning_rate": 2.9977731735282248e-05, + "loss": 2.8107, + "step": 3864500 + }, + { + "epoch": 1.201491528163552, + "grad_norm": 11.192277908325195, + "learning_rate": 2.9975141197274132e-05, + "loss": 2.7973, + "step": 3865000 + }, + { + "epoch": 1.201646960444039, + "grad_norm": 9.23980712890625, + "learning_rate": 2.997255065926602e-05, + "loss": 2.8411, + "step": 3865500 + }, + { + "epoch": 1.2018023927245258, + "grad_norm": 8.454206466674805, + "learning_rate": 2.9969960121257906e-05, + "loss": 2.8064, + "step": 3866000 + }, + { + "epoch": 1.2019578250050127, + "grad_norm": 8.662186622619629, + "learning_rate": 2.9967369583249787e-05, + "loss": 2.8847, + "step": 3866500 + }, + { + "epoch": 1.2021132572854996, + "grad_norm": 14.329809188842773, + "learning_rate": 2.9964779045241677e-05, + "loss": 2.8665, + "step": 3867000 + }, + { + "epoch": 1.2022686895659864, + "grad_norm": 9.843072891235352, + "learning_rate": 2.9962188507233564e-05, + "loss": 2.8124, + "step": 3867500 + }, + { + "epoch": 1.2024241218464733, + "grad_norm": 8.996434211730957, + "learning_rate": 2.9959597969225445e-05, + "loss": 2.8699, + "step": 3868000 + }, + { + "epoch": 1.2025795541269602, + "grad_norm": 8.991848945617676, + "learning_rate": 2.9957007431217332e-05, + "loss": 2.8627, + "step": 3868500 + }, + { + "epoch": 1.202734986407447, + "grad_norm": 8.835808753967285, + "learning_rate": 2.995441689320922e-05, + "loss": 2.8618, + "step": 3869000 + }, + { + "epoch": 1.202890418687934, + "grad_norm": 8.879846572875977, + "learning_rate": 2.9951826355201103e-05, + "loss": 2.8518, + "step": 3869500 + }, + { + "epoch": 1.2030458509684208, + "grad_norm": 8.397418975830078, + "learning_rate": 2.994923581719299e-05, + "loss": 2.8467, + "step": 3870000 + }, + { + "epoch": 1.2032012832489076, + "grad_norm": 6.57659912109375, + "learning_rate": 2.994664527918487e-05, + "loss": 2.8082, + "step": 3870500 + }, + { + "epoch": 1.2033567155293945, + "grad_norm": 13.47642707824707, + "learning_rate": 2.9944054741176757e-05, + "loss": 2.8243, + "step": 3871000 + }, + { + "epoch": 1.2035121478098814, + "grad_norm": 16.732379913330078, + "learning_rate": 2.9941464203168644e-05, + "loss": 2.8514, + "step": 3871500 + }, + { + "epoch": 1.2036675800903684, + "grad_norm": 7.716498851776123, + "learning_rate": 2.9938873665160528e-05, + "loss": 2.8096, + "step": 3872000 + }, + { + "epoch": 1.2038230123708553, + "grad_norm": 9.510353088378906, + "learning_rate": 2.9936283127152415e-05, + "loss": 2.8511, + "step": 3872500 + }, + { + "epoch": 1.2039784446513422, + "grad_norm": 39.3825569152832, + "learning_rate": 2.9933692589144303e-05, + "loss": 2.8603, + "step": 3873000 + }, + { + "epoch": 1.204133876931829, + "grad_norm": 15.061357498168945, + "learning_rate": 2.9931102051136183e-05, + "loss": 2.8358, + "step": 3873500 + }, + { + "epoch": 1.204289309212316, + "grad_norm": 9.129862785339355, + "learning_rate": 2.992851151312807e-05, + "loss": 2.8253, + "step": 3874000 + }, + { + "epoch": 1.2044447414928028, + "grad_norm": 8.115663528442383, + "learning_rate": 2.9925920975119957e-05, + "loss": 2.8488, + "step": 3874500 + }, + { + "epoch": 1.2046001737732897, + "grad_norm": 8.533210754394531, + "learning_rate": 2.992333043711184e-05, + "loss": 2.8143, + "step": 3875000 + }, + { + "epoch": 1.2047556060537765, + "grad_norm": 8.741473197937012, + "learning_rate": 2.9920739899103728e-05, + "loss": 2.8487, + "step": 3875500 + }, + { + "epoch": 1.2049110383342634, + "grad_norm": 9.421091079711914, + "learning_rate": 2.991814936109561e-05, + "loss": 2.8395, + "step": 3876000 + }, + { + "epoch": 1.2050664706147503, + "grad_norm": 7.429145336151123, + "learning_rate": 2.9915558823087496e-05, + "loss": 2.8284, + "step": 3876500 + }, + { + "epoch": 1.2052219028952371, + "grad_norm": 7.833978176116943, + "learning_rate": 2.9912968285079386e-05, + "loss": 2.8465, + "step": 3877000 + }, + { + "epoch": 1.205377335175724, + "grad_norm": 11.525514602661133, + "learning_rate": 2.9910377747071267e-05, + "loss": 2.8587, + "step": 3877500 + }, + { + "epoch": 1.2055327674562109, + "grad_norm": 13.784784317016602, + "learning_rate": 2.9907787209063154e-05, + "loss": 2.8439, + "step": 3878000 + }, + { + "epoch": 1.2056881997366977, + "grad_norm": 7.88787317276001, + "learning_rate": 2.990519667105504e-05, + "loss": 2.8188, + "step": 3878500 + }, + { + "epoch": 1.2058436320171846, + "grad_norm": 16.558908462524414, + "learning_rate": 2.9902606133046925e-05, + "loss": 2.8582, + "step": 3879000 + }, + { + "epoch": 1.2059990642976715, + "grad_norm": 16.88605499267578, + "learning_rate": 2.9900015595038812e-05, + "loss": 2.8648, + "step": 3879500 + }, + { + "epoch": 1.2061544965781583, + "grad_norm": 8.353869438171387, + "learning_rate": 2.98974250570307e-05, + "loss": 2.8731, + "step": 3880000 + }, + { + "epoch": 1.2063099288586452, + "grad_norm": 9.583057403564453, + "learning_rate": 2.989483451902258e-05, + "loss": 2.8642, + "step": 3880500 + }, + { + "epoch": 1.206465361139132, + "grad_norm": 8.609167098999023, + "learning_rate": 2.9892243981014467e-05, + "loss": 2.8414, + "step": 3881000 + }, + { + "epoch": 1.206620793419619, + "grad_norm": 8.925564765930176, + "learning_rate": 2.9889653443006354e-05, + "loss": 2.7979, + "step": 3881500 + }, + { + "epoch": 1.2067762257001058, + "grad_norm": 8.05872631072998, + "learning_rate": 2.9887062904998237e-05, + "loss": 2.9136, + "step": 3882000 + }, + { + "epoch": 1.2069316579805927, + "grad_norm": 38.894927978515625, + "learning_rate": 2.9884472366990125e-05, + "loss": 2.8215, + "step": 3882500 + }, + { + "epoch": 1.2070870902610795, + "grad_norm": 9.787967681884766, + "learning_rate": 2.9881881828982005e-05, + "loss": 2.8558, + "step": 3883000 + }, + { + "epoch": 1.2072425225415664, + "grad_norm": 8.858201026916504, + "learning_rate": 2.9879291290973892e-05, + "loss": 2.8639, + "step": 3883500 + }, + { + "epoch": 1.2073979548220533, + "grad_norm": 8.286060333251953, + "learning_rate": 2.987670075296578e-05, + "loss": 2.8402, + "step": 3884000 + }, + { + "epoch": 1.2075533871025401, + "grad_norm": 7.985448837280273, + "learning_rate": 2.9874110214957663e-05, + "loss": 2.8552, + "step": 3884500 + }, + { + "epoch": 1.207708819383027, + "grad_norm": 9.004388809204102, + "learning_rate": 2.987151967694955e-05, + "loss": 2.8784, + "step": 3885000 + }, + { + "epoch": 1.207864251663514, + "grad_norm": 33.19658660888672, + "learning_rate": 2.9868929138941437e-05, + "loss": 2.8183, + "step": 3885500 + }, + { + "epoch": 1.208019683944001, + "grad_norm": 9.169126510620117, + "learning_rate": 2.9866338600933318e-05, + "loss": 2.8528, + "step": 3886000 + }, + { + "epoch": 1.2081751162244878, + "grad_norm": 6.985393047332764, + "learning_rate": 2.9863748062925205e-05, + "loss": 2.833, + "step": 3886500 + }, + { + "epoch": 1.2083305485049747, + "grad_norm": 10.03382396697998, + "learning_rate": 2.9861157524917095e-05, + "loss": 2.8149, + "step": 3887000 + }, + { + "epoch": 1.2084859807854615, + "grad_norm": 55.48240280151367, + "learning_rate": 2.9858566986908976e-05, + "loss": 2.8631, + "step": 3887500 + }, + { + "epoch": 1.2086414130659484, + "grad_norm": 27.82896614074707, + "learning_rate": 2.9855976448900863e-05, + "loss": 2.8236, + "step": 3888000 + }, + { + "epoch": 1.2087968453464353, + "grad_norm": 9.306478500366211, + "learning_rate": 2.9853385910892743e-05, + "loss": 2.8715, + "step": 3888500 + }, + { + "epoch": 1.2089522776269221, + "grad_norm": 8.685315132141113, + "learning_rate": 2.9850795372884634e-05, + "loss": 2.8039, + "step": 3889000 + }, + { + "epoch": 1.209107709907409, + "grad_norm": 15.625133514404297, + "learning_rate": 2.984820483487652e-05, + "loss": 2.8814, + "step": 3889500 + }, + { + "epoch": 1.2092631421878959, + "grad_norm": 15.749161720275879, + "learning_rate": 2.98456142968684e-05, + "loss": 2.8603, + "step": 3890000 + }, + { + "epoch": 1.2094185744683827, + "grad_norm": 9.6488037109375, + "learning_rate": 2.984302375886029e-05, + "loss": 2.8614, + "step": 3890500 + }, + { + "epoch": 1.2095740067488696, + "grad_norm": 8.060540199279785, + "learning_rate": 2.9840433220852176e-05, + "loss": 2.8539, + "step": 3891000 + }, + { + "epoch": 1.2097294390293565, + "grad_norm": 12.320846557617188, + "learning_rate": 2.983784268284406e-05, + "loss": 2.8349, + "step": 3891500 + }, + { + "epoch": 1.2098848713098433, + "grad_norm": 8.518173217773438, + "learning_rate": 2.9835252144835947e-05, + "loss": 2.8107, + "step": 3892000 + }, + { + "epoch": 1.2100403035903302, + "grad_norm": 11.723852157592773, + "learning_rate": 2.9832661606827834e-05, + "loss": 2.8111, + "step": 3892500 + }, + { + "epoch": 1.210195735870817, + "grad_norm": 9.390902519226074, + "learning_rate": 2.9830071068819714e-05, + "loss": 2.8409, + "step": 3893000 + }, + { + "epoch": 1.210351168151304, + "grad_norm": 8.102516174316406, + "learning_rate": 2.98274805308116e-05, + "loss": 2.7975, + "step": 3893500 + }, + { + "epoch": 1.2105066004317908, + "grad_norm": 11.520282745361328, + "learning_rate": 2.9824889992803485e-05, + "loss": 2.8293, + "step": 3894000 + }, + { + "epoch": 1.2106620327122777, + "grad_norm": 7.0557026863098145, + "learning_rate": 2.9822299454795372e-05, + "loss": 2.7632, + "step": 3894500 + }, + { + "epoch": 1.2108174649927645, + "grad_norm": 9.495705604553223, + "learning_rate": 2.981970891678726e-05, + "loss": 2.8302, + "step": 3895000 + }, + { + "epoch": 1.2109728972732514, + "grad_norm": 10.576446533203125, + "learning_rate": 2.981711837877914e-05, + "loss": 2.8098, + "step": 3895500 + }, + { + "epoch": 1.2111283295537385, + "grad_norm": 7.936706066131592, + "learning_rate": 2.9814527840771027e-05, + "loss": 2.8517, + "step": 3896000 + }, + { + "epoch": 1.2112837618342254, + "grad_norm": 9.22834300994873, + "learning_rate": 2.9811937302762914e-05, + "loss": 2.8384, + "step": 3896500 + }, + { + "epoch": 1.2114391941147122, + "grad_norm": 7.721888065338135, + "learning_rate": 2.9809346764754798e-05, + "loss": 2.8057, + "step": 3897000 + }, + { + "epoch": 1.211594626395199, + "grad_norm": 9.604053497314453, + "learning_rate": 2.9806756226746685e-05, + "loss": 2.8666, + "step": 3897500 + }, + { + "epoch": 1.211750058675686, + "grad_norm": 12.522049903869629, + "learning_rate": 2.9804165688738572e-05, + "loss": 2.8608, + "step": 3898000 + }, + { + "epoch": 1.2119054909561728, + "grad_norm": 10.580018043518066, + "learning_rate": 2.9801575150730453e-05, + "loss": 2.8747, + "step": 3898500 + }, + { + "epoch": 1.2120609232366597, + "grad_norm": 8.609994888305664, + "learning_rate": 2.9798984612722343e-05, + "loss": 2.836, + "step": 3899000 + }, + { + "epoch": 1.2122163555171466, + "grad_norm": 9.2891845703125, + "learning_rate": 2.979639407471423e-05, + "loss": 2.8527, + "step": 3899500 + }, + { + "epoch": 1.2123717877976334, + "grad_norm": 9.062604904174805, + "learning_rate": 2.979380353670611e-05, + "loss": 2.8462, + "step": 3900000 + }, + { + "epoch": 1.2125272200781203, + "grad_norm": 11.786718368530273, + "learning_rate": 2.9791212998697998e-05, + "loss": 2.8315, + "step": 3900500 + }, + { + "epoch": 1.2126826523586072, + "grad_norm": 9.000544548034668, + "learning_rate": 2.978862246068988e-05, + "loss": 2.8304, + "step": 3901000 + }, + { + "epoch": 1.212838084639094, + "grad_norm": 9.649978637695312, + "learning_rate": 2.978603192268177e-05, + "loss": 2.8124, + "step": 3901500 + }, + { + "epoch": 1.212993516919581, + "grad_norm": 7.6739091873168945, + "learning_rate": 2.9783441384673656e-05, + "loss": 2.8301, + "step": 3902000 + }, + { + "epoch": 1.2131489492000678, + "grad_norm": 7.205316066741943, + "learning_rate": 2.9780850846665536e-05, + "loss": 2.8833, + "step": 3902500 + }, + { + "epoch": 1.2133043814805546, + "grad_norm": 8.07262134552002, + "learning_rate": 2.9778260308657423e-05, + "loss": 2.8518, + "step": 3903000 + }, + { + "epoch": 1.2134598137610415, + "grad_norm": 7.225470066070557, + "learning_rate": 2.977566977064931e-05, + "loss": 2.8555, + "step": 3903500 + }, + { + "epoch": 1.2136152460415284, + "grad_norm": 10.234728813171387, + "learning_rate": 2.9773079232641194e-05, + "loss": 2.8442, + "step": 3904000 + }, + { + "epoch": 1.2137706783220152, + "grad_norm": 8.558043479919434, + "learning_rate": 2.977048869463308e-05, + "loss": 2.7571, + "step": 3904500 + }, + { + "epoch": 1.213926110602502, + "grad_norm": 9.109935760498047, + "learning_rate": 2.976789815662497e-05, + "loss": 2.7928, + "step": 3905000 + }, + { + "epoch": 1.214081542882989, + "grad_norm": 11.03509521484375, + "learning_rate": 2.976530761861685e-05, + "loss": 2.8454, + "step": 3905500 + }, + { + "epoch": 1.2142369751634758, + "grad_norm": 7.348136901855469, + "learning_rate": 2.9762717080608736e-05, + "loss": 2.8311, + "step": 3906000 + }, + { + "epoch": 1.2143924074439627, + "grad_norm": 12.029099464416504, + "learning_rate": 2.976012654260062e-05, + "loss": 2.837, + "step": 3906500 + }, + { + "epoch": 1.2145478397244496, + "grad_norm": 10.431262969970703, + "learning_rate": 2.9757536004592507e-05, + "loss": 2.8728, + "step": 3907000 + }, + { + "epoch": 1.2147032720049364, + "grad_norm": 8.95716381072998, + "learning_rate": 2.9754945466584394e-05, + "loss": 2.7751, + "step": 3907500 + }, + { + "epoch": 1.2148587042854233, + "grad_norm": 10.576210975646973, + "learning_rate": 2.9752354928576275e-05, + "loss": 2.8267, + "step": 3908000 + }, + { + "epoch": 1.2150141365659102, + "grad_norm": 9.687165260314941, + "learning_rate": 2.9749764390568162e-05, + "loss": 2.867, + "step": 3908500 + }, + { + "epoch": 1.215169568846397, + "grad_norm": 10.64867115020752, + "learning_rate": 2.9747173852560052e-05, + "loss": 2.8649, + "step": 3909000 + }, + { + "epoch": 1.2153250011268841, + "grad_norm": 10.47544002532959, + "learning_rate": 2.9744583314551933e-05, + "loss": 2.8169, + "step": 3909500 + }, + { + "epoch": 1.215480433407371, + "grad_norm": 11.917901039123535, + "learning_rate": 2.974199277654382e-05, + "loss": 2.8579, + "step": 3910000 + }, + { + "epoch": 1.2156358656878579, + "grad_norm": 9.13109016418457, + "learning_rate": 2.9739402238535707e-05, + "loss": 2.8509, + "step": 3910500 + }, + { + "epoch": 1.2157912979683447, + "grad_norm": 10.719996452331543, + "learning_rate": 2.973681170052759e-05, + "loss": 2.8584, + "step": 3911000 + }, + { + "epoch": 1.2159467302488316, + "grad_norm": 9.524964332580566, + "learning_rate": 2.9734221162519478e-05, + "loss": 2.8437, + "step": 3911500 + }, + { + "epoch": 1.2161021625293185, + "grad_norm": 10.682878494262695, + "learning_rate": 2.9731630624511358e-05, + "loss": 2.8594, + "step": 3912000 + }, + { + "epoch": 1.2162575948098053, + "grad_norm": 9.472940444946289, + "learning_rate": 2.9729040086503245e-05, + "loss": 2.847, + "step": 3912500 + }, + { + "epoch": 1.2164130270902922, + "grad_norm": 9.232732772827148, + "learning_rate": 2.9726449548495133e-05, + "loss": 2.8193, + "step": 3913000 + }, + { + "epoch": 1.216568459370779, + "grad_norm": 10.123076438903809, + "learning_rate": 2.9723859010487016e-05, + "loss": 2.7968, + "step": 3913500 + }, + { + "epoch": 1.216723891651266, + "grad_norm": 10.846146583557129, + "learning_rate": 2.9721268472478903e-05, + "loss": 2.8259, + "step": 3914000 + }, + { + "epoch": 1.2168793239317528, + "grad_norm": 9.140238761901855, + "learning_rate": 2.971867793447079e-05, + "loss": 2.8486, + "step": 3914500 + }, + { + "epoch": 1.2170347562122397, + "grad_norm": 14.626500129699707, + "learning_rate": 2.971608739646267e-05, + "loss": 2.8491, + "step": 3915000 + }, + { + "epoch": 1.2171901884927265, + "grad_norm": 11.196359634399414, + "learning_rate": 2.9713496858454558e-05, + "loss": 2.7897, + "step": 3915500 + }, + { + "epoch": 1.2173456207732134, + "grad_norm": 8.29435920715332, + "learning_rate": 2.9710906320446445e-05, + "loss": 2.8389, + "step": 3916000 + }, + { + "epoch": 1.2175010530537003, + "grad_norm": 8.676379203796387, + "learning_rate": 2.970831578243833e-05, + "loss": 2.8166, + "step": 3916500 + }, + { + "epoch": 1.2176564853341871, + "grad_norm": 10.63679313659668, + "learning_rate": 2.9705725244430216e-05, + "loss": 2.8144, + "step": 3917000 + }, + { + "epoch": 1.217811917614674, + "grad_norm": 23.32145118713379, + "learning_rate": 2.9703134706422103e-05, + "loss": 2.8179, + "step": 3917500 + }, + { + "epoch": 1.2179673498951609, + "grad_norm": 11.114822387695312, + "learning_rate": 2.9700544168413984e-05, + "loss": 2.8632, + "step": 3918000 + }, + { + "epoch": 1.2181227821756477, + "grad_norm": 9.40905475616455, + "learning_rate": 2.969795363040587e-05, + "loss": 2.8134, + "step": 3918500 + }, + { + "epoch": 1.2182782144561346, + "grad_norm": 11.240846633911133, + "learning_rate": 2.9695363092397755e-05, + "loss": 2.8953, + "step": 3919000 + }, + { + "epoch": 1.2184336467366215, + "grad_norm": 9.653943061828613, + "learning_rate": 2.9692772554389642e-05, + "loss": 2.848, + "step": 3919500 + }, + { + "epoch": 1.2185890790171083, + "grad_norm": 9.233441352844238, + "learning_rate": 2.969018201638153e-05, + "loss": 2.8341, + "step": 3920000 + }, + { + "epoch": 1.2187445112975954, + "grad_norm": 8.668190956115723, + "learning_rate": 2.968759147837341e-05, + "loss": 2.8488, + "step": 3920500 + }, + { + "epoch": 1.2188999435780823, + "grad_norm": 9.57172679901123, + "learning_rate": 2.96850009403653e-05, + "loss": 2.7986, + "step": 3921000 + }, + { + "epoch": 1.2190553758585692, + "grad_norm": 9.987310409545898, + "learning_rate": 2.9682410402357187e-05, + "loss": 2.8316, + "step": 3921500 + }, + { + "epoch": 1.219210808139056, + "grad_norm": 11.272099494934082, + "learning_rate": 2.9679819864349067e-05, + "loss": 2.8678, + "step": 3922000 + }, + { + "epoch": 1.219366240419543, + "grad_norm": 9.264850616455078, + "learning_rate": 2.9677229326340955e-05, + "loss": 2.8157, + "step": 3922500 + }, + { + "epoch": 1.2195216727000298, + "grad_norm": 10.536489486694336, + "learning_rate": 2.9674638788332842e-05, + "loss": 2.8784, + "step": 3923000 + }, + { + "epoch": 1.2196771049805166, + "grad_norm": 10.180604934692383, + "learning_rate": 2.9672048250324725e-05, + "loss": 2.8565, + "step": 3923500 + }, + { + "epoch": 1.2198325372610035, + "grad_norm": 10.666486740112305, + "learning_rate": 2.9669457712316613e-05, + "loss": 2.8899, + "step": 3924000 + }, + { + "epoch": 1.2199879695414904, + "grad_norm": 15.207863807678223, + "learning_rate": 2.9666867174308493e-05, + "loss": 2.8607, + "step": 3924500 + }, + { + "epoch": 1.2201434018219772, + "grad_norm": 18.040616989135742, + "learning_rate": 2.966427663630038e-05, + "loss": 2.8497, + "step": 3925000 + }, + { + "epoch": 1.220298834102464, + "grad_norm": 14.184249877929688, + "learning_rate": 2.9661686098292267e-05, + "loss": 2.8051, + "step": 3925500 + }, + { + "epoch": 1.220454266382951, + "grad_norm": 10.19769287109375, + "learning_rate": 2.965909556028415e-05, + "loss": 2.8028, + "step": 3926000 + }, + { + "epoch": 1.2206096986634378, + "grad_norm": 9.111956596374512, + "learning_rate": 2.9656505022276038e-05, + "loss": 2.7621, + "step": 3926500 + }, + { + "epoch": 1.2207651309439247, + "grad_norm": 10.055797576904297, + "learning_rate": 2.9653914484267925e-05, + "loss": 2.8579, + "step": 3927000 + }, + { + "epoch": 1.2209205632244116, + "grad_norm": 7.58418607711792, + "learning_rate": 2.9651323946259806e-05, + "loss": 2.8202, + "step": 3927500 + }, + { + "epoch": 1.2210759955048984, + "grad_norm": 8.28561782836914, + "learning_rate": 2.9648733408251693e-05, + "loss": 2.8657, + "step": 3928000 + }, + { + "epoch": 1.2212314277853853, + "grad_norm": 9.585014343261719, + "learning_rate": 2.964614287024358e-05, + "loss": 2.8829, + "step": 3928500 + }, + { + "epoch": 1.2213868600658722, + "grad_norm": 9.526019096374512, + "learning_rate": 2.9643552332235464e-05, + "loss": 2.8296, + "step": 3929000 + }, + { + "epoch": 1.221542292346359, + "grad_norm": 11.542062759399414, + "learning_rate": 2.964096179422735e-05, + "loss": 2.8211, + "step": 3929500 + }, + { + "epoch": 1.221697724626846, + "grad_norm": 7.844536781311035, + "learning_rate": 2.963837125621923e-05, + "loss": 2.8793, + "step": 3930000 + }, + { + "epoch": 1.2218531569073328, + "grad_norm": 15.2218599319458, + "learning_rate": 2.9635780718211122e-05, + "loss": 2.8464, + "step": 3930500 + }, + { + "epoch": 1.2220085891878196, + "grad_norm": 12.30924129486084, + "learning_rate": 2.963319018020301e-05, + "loss": 2.8046, + "step": 3931000 + }, + { + "epoch": 1.2221640214683065, + "grad_norm": 8.48153018951416, + "learning_rate": 2.963059964219489e-05, + "loss": 2.8062, + "step": 3931500 + }, + { + "epoch": 1.2223194537487934, + "grad_norm": 9.341440200805664, + "learning_rate": 2.9628009104186777e-05, + "loss": 2.8026, + "step": 3932000 + }, + { + "epoch": 1.2224748860292802, + "grad_norm": 7.4586029052734375, + "learning_rate": 2.9625418566178664e-05, + "loss": 2.8157, + "step": 3932500 + }, + { + "epoch": 1.222630318309767, + "grad_norm": 8.026036262512207, + "learning_rate": 2.9622828028170548e-05, + "loss": 2.8601, + "step": 3933000 + }, + { + "epoch": 1.222785750590254, + "grad_norm": 8.90701961517334, + "learning_rate": 2.9620237490162435e-05, + "loss": 2.8355, + "step": 3933500 + }, + { + "epoch": 1.222941182870741, + "grad_norm": 9.988880157470703, + "learning_rate": 2.9617646952154322e-05, + "loss": 2.8122, + "step": 3934000 + }, + { + "epoch": 1.223096615151228, + "grad_norm": 9.216085433959961, + "learning_rate": 2.9615056414146202e-05, + "loss": 2.8358, + "step": 3934500 + }, + { + "epoch": 1.2232520474317148, + "grad_norm": 8.174701690673828, + "learning_rate": 2.961246587613809e-05, + "loss": 2.8208, + "step": 3935000 + }, + { + "epoch": 1.2234074797122017, + "grad_norm": 12.463690757751465, + "learning_rate": 2.9609875338129977e-05, + "loss": 2.7904, + "step": 3935500 + }, + { + "epoch": 1.2235629119926885, + "grad_norm": 12.258811950683594, + "learning_rate": 2.960728480012186e-05, + "loss": 2.8293, + "step": 3936000 + }, + { + "epoch": 1.2237183442731754, + "grad_norm": 19.39193344116211, + "learning_rate": 2.9604694262113747e-05, + "loss": 2.8155, + "step": 3936500 + }, + { + "epoch": 1.2238737765536623, + "grad_norm": 9.538865089416504, + "learning_rate": 2.9602103724105628e-05, + "loss": 2.8269, + "step": 3937000 + }, + { + "epoch": 1.2240292088341491, + "grad_norm": 9.102277755737305, + "learning_rate": 2.9599513186097515e-05, + "loss": 2.8335, + "step": 3937500 + }, + { + "epoch": 1.224184641114636, + "grad_norm": 10.385014533996582, + "learning_rate": 2.9596922648089402e-05, + "loss": 2.87, + "step": 3938000 + }, + { + "epoch": 1.2243400733951229, + "grad_norm": 8.29924201965332, + "learning_rate": 2.9594332110081286e-05, + "loss": 2.8651, + "step": 3938500 + }, + { + "epoch": 1.2244955056756097, + "grad_norm": 7.484210968017578, + "learning_rate": 2.9591741572073173e-05, + "loss": 2.7804, + "step": 3939000 + }, + { + "epoch": 1.2246509379560966, + "grad_norm": 15.226079940795898, + "learning_rate": 2.958915103406506e-05, + "loss": 2.8534, + "step": 3939500 + }, + { + "epoch": 1.2248063702365835, + "grad_norm": 10.00631046295166, + "learning_rate": 2.958656049605694e-05, + "loss": 2.8219, + "step": 3940000 + }, + { + "epoch": 1.2249618025170703, + "grad_norm": 8.94103717803955, + "learning_rate": 2.958396995804883e-05, + "loss": 2.8616, + "step": 3940500 + }, + { + "epoch": 1.2251172347975572, + "grad_norm": 13.667614936828613, + "learning_rate": 2.9581379420040718e-05, + "loss": 2.8188, + "step": 3941000 + }, + { + "epoch": 1.225272667078044, + "grad_norm": 20.310049057006836, + "learning_rate": 2.95787888820326e-05, + "loss": 2.8629, + "step": 3941500 + }, + { + "epoch": 1.225428099358531, + "grad_norm": 9.27412223815918, + "learning_rate": 2.9576198344024486e-05, + "loss": 2.8443, + "step": 3942000 + }, + { + "epoch": 1.2255835316390178, + "grad_norm": 26.800683975219727, + "learning_rate": 2.957360780601637e-05, + "loss": 2.8394, + "step": 3942500 + }, + { + "epoch": 1.2257389639195047, + "grad_norm": 11.768794059753418, + "learning_rate": 2.9571017268008257e-05, + "loss": 2.81, + "step": 3943000 + }, + { + "epoch": 1.2258943961999915, + "grad_norm": 8.154715538024902, + "learning_rate": 2.9568426730000144e-05, + "loss": 2.8783, + "step": 3943500 + }, + { + "epoch": 1.2260498284804784, + "grad_norm": 6.817788600921631, + "learning_rate": 2.9565836191992024e-05, + "loss": 2.8481, + "step": 3944000 + }, + { + "epoch": 1.2262052607609655, + "grad_norm": 11.05746078491211, + "learning_rate": 2.956324565398391e-05, + "loss": 2.811, + "step": 3944500 + }, + { + "epoch": 1.2263606930414523, + "grad_norm": 8.174612998962402, + "learning_rate": 2.95606551159758e-05, + "loss": 2.8269, + "step": 3945000 + }, + { + "epoch": 1.2265161253219392, + "grad_norm": 11.325225830078125, + "learning_rate": 2.9558064577967682e-05, + "loss": 2.8117, + "step": 3945500 + }, + { + "epoch": 1.226671557602426, + "grad_norm": 9.816883087158203, + "learning_rate": 2.955547403995957e-05, + "loss": 2.8513, + "step": 3946000 + }, + { + "epoch": 1.226826989882913, + "grad_norm": 7.684217929840088, + "learning_rate": 2.9552883501951457e-05, + "loss": 2.8627, + "step": 3946500 + }, + { + "epoch": 1.2269824221633998, + "grad_norm": 7.887423038482666, + "learning_rate": 2.9550292963943337e-05, + "loss": 2.8296, + "step": 3947000 + }, + { + "epoch": 1.2271378544438867, + "grad_norm": 9.094382286071777, + "learning_rate": 2.9547702425935224e-05, + "loss": 2.8307, + "step": 3947500 + }, + { + "epoch": 1.2272932867243735, + "grad_norm": 9.119983673095703, + "learning_rate": 2.9545111887927108e-05, + "loss": 2.8378, + "step": 3948000 + }, + { + "epoch": 1.2274487190048604, + "grad_norm": 7.987030982971191, + "learning_rate": 2.9542521349918995e-05, + "loss": 2.8271, + "step": 3948500 + }, + { + "epoch": 1.2276041512853473, + "grad_norm": 5.556906223297119, + "learning_rate": 2.9539930811910882e-05, + "loss": 2.8248, + "step": 3949000 + }, + { + "epoch": 1.2277595835658341, + "grad_norm": 10.116874694824219, + "learning_rate": 2.9537340273902763e-05, + "loss": 2.8351, + "step": 3949500 + }, + { + "epoch": 1.227915015846321, + "grad_norm": 8.564074516296387, + "learning_rate": 2.953474973589465e-05, + "loss": 2.8522, + "step": 3950000 + }, + { + "epoch": 1.2280704481268079, + "grad_norm": 8.819396018981934, + "learning_rate": 2.953215919788654e-05, + "loss": 2.8434, + "step": 3950500 + }, + { + "epoch": 1.2282258804072947, + "grad_norm": 8.287358283996582, + "learning_rate": 2.952956865987842e-05, + "loss": 2.8276, + "step": 3951000 + }, + { + "epoch": 1.2283813126877816, + "grad_norm": 12.591565132141113, + "learning_rate": 2.9526978121870308e-05, + "loss": 2.8611, + "step": 3951500 + }, + { + "epoch": 1.2285367449682685, + "grad_norm": 10.20276165008545, + "learning_rate": 2.9524387583862195e-05, + "loss": 2.8463, + "step": 3952000 + }, + { + "epoch": 1.2286921772487553, + "grad_norm": 8.623815536499023, + "learning_rate": 2.952179704585408e-05, + "loss": 2.827, + "step": 3952500 + }, + { + "epoch": 1.2288476095292422, + "grad_norm": 9.679289817810059, + "learning_rate": 2.9519206507845966e-05, + "loss": 2.8383, + "step": 3953000 + }, + { + "epoch": 1.229003041809729, + "grad_norm": 8.191078186035156, + "learning_rate": 2.9516615969837853e-05, + "loss": 2.8323, + "step": 3953500 + }, + { + "epoch": 1.229158474090216, + "grad_norm": 8.535715103149414, + "learning_rate": 2.9514025431829733e-05, + "loss": 2.8641, + "step": 3954000 + }, + { + "epoch": 1.2293139063707028, + "grad_norm": 11.202230453491211, + "learning_rate": 2.951143489382162e-05, + "loss": 2.8908, + "step": 3954500 + }, + { + "epoch": 1.2294693386511897, + "grad_norm": 8.438536643981934, + "learning_rate": 2.9508844355813504e-05, + "loss": 2.8229, + "step": 3955000 + }, + { + "epoch": 1.2296247709316765, + "grad_norm": 8.753080368041992, + "learning_rate": 2.950625381780539e-05, + "loss": 2.8267, + "step": 3955500 + }, + { + "epoch": 1.2297802032121634, + "grad_norm": 8.733397483825684, + "learning_rate": 2.950366327979728e-05, + "loss": 2.7752, + "step": 3956000 + }, + { + "epoch": 1.2299356354926503, + "grad_norm": 23.541147232055664, + "learning_rate": 2.950107274178916e-05, + "loss": 2.8301, + "step": 3956500 + }, + { + "epoch": 1.2300910677731371, + "grad_norm": 9.124503135681152, + "learning_rate": 2.9498482203781046e-05, + "loss": 2.7996, + "step": 3957000 + }, + { + "epoch": 1.230246500053624, + "grad_norm": 9.672191619873047, + "learning_rate": 2.9495891665772933e-05, + "loss": 2.8353, + "step": 3957500 + }, + { + "epoch": 1.230401932334111, + "grad_norm": 74.9887466430664, + "learning_rate": 2.9493301127764817e-05, + "loss": 2.8101, + "step": 3958000 + }, + { + "epoch": 1.230557364614598, + "grad_norm": 8.259323120117188, + "learning_rate": 2.9490710589756704e-05, + "loss": 2.8547, + "step": 3958500 + }, + { + "epoch": 1.2307127968950848, + "grad_norm": 10.98310661315918, + "learning_rate": 2.948812005174859e-05, + "loss": 2.8172, + "step": 3959000 + }, + { + "epoch": 1.2308682291755717, + "grad_norm": 9.564414024353027, + "learning_rate": 2.9485529513740472e-05, + "loss": 2.8499, + "step": 3959500 + }, + { + "epoch": 1.2310236614560586, + "grad_norm": 7.671130657196045, + "learning_rate": 2.948293897573236e-05, + "loss": 2.8626, + "step": 3960000 + }, + { + "epoch": 1.2311790937365454, + "grad_norm": 8.816036224365234, + "learning_rate": 2.9480348437724243e-05, + "loss": 2.8165, + "step": 3960500 + }, + { + "epoch": 1.2313345260170323, + "grad_norm": 8.880002975463867, + "learning_rate": 2.947775789971613e-05, + "loss": 2.878, + "step": 3961000 + }, + { + "epoch": 1.2314899582975192, + "grad_norm": 9.685297012329102, + "learning_rate": 2.9475167361708017e-05, + "loss": 2.852, + "step": 3961500 + }, + { + "epoch": 1.231645390578006, + "grad_norm": 9.0397367477417, + "learning_rate": 2.9472576823699897e-05, + "loss": 2.8396, + "step": 3962000 + }, + { + "epoch": 1.231800822858493, + "grad_norm": 15.849554061889648, + "learning_rate": 2.9469986285691788e-05, + "loss": 2.825, + "step": 3962500 + }, + { + "epoch": 1.2319562551389798, + "grad_norm": 13.582427024841309, + "learning_rate": 2.9467395747683675e-05, + "loss": 2.8413, + "step": 3963000 + }, + { + "epoch": 1.2321116874194666, + "grad_norm": 15.811665534973145, + "learning_rate": 2.9464805209675555e-05, + "loss": 2.8686, + "step": 3963500 + }, + { + "epoch": 1.2322671196999535, + "grad_norm": 8.079251289367676, + "learning_rate": 2.9462214671667443e-05, + "loss": 2.8423, + "step": 3964000 + }, + { + "epoch": 1.2324225519804404, + "grad_norm": 10.696008682250977, + "learning_rate": 2.945962413365933e-05, + "loss": 2.8458, + "step": 3964500 + }, + { + "epoch": 1.2325779842609272, + "grad_norm": 15.90536880493164, + "learning_rate": 2.9457033595651214e-05, + "loss": 2.8472, + "step": 3965000 + }, + { + "epoch": 1.232733416541414, + "grad_norm": 13.826976776123047, + "learning_rate": 2.94544430576431e-05, + "loss": 2.8772, + "step": 3965500 + }, + { + "epoch": 1.232888848821901, + "grad_norm": 9.182855606079102, + "learning_rate": 2.945185251963498e-05, + "loss": 2.7729, + "step": 3966000 + }, + { + "epoch": 1.2330442811023878, + "grad_norm": 8.77829360961914, + "learning_rate": 2.9449261981626868e-05, + "loss": 2.8182, + "step": 3966500 + }, + { + "epoch": 1.2331997133828747, + "grad_norm": 13.8753080368042, + "learning_rate": 2.9446671443618755e-05, + "loss": 2.8229, + "step": 3967000 + }, + { + "epoch": 1.2333551456633616, + "grad_norm": 14.742850303649902, + "learning_rate": 2.944408090561064e-05, + "loss": 2.8484, + "step": 3967500 + }, + { + "epoch": 1.2335105779438484, + "grad_norm": 12.873085021972656, + "learning_rate": 2.9441490367602526e-05, + "loss": 2.9028, + "step": 3968000 + }, + { + "epoch": 1.2336660102243355, + "grad_norm": 8.524004936218262, + "learning_rate": 2.9438899829594413e-05, + "loss": 2.8622, + "step": 3968500 + }, + { + "epoch": 1.2338214425048224, + "grad_norm": 10.76765251159668, + "learning_rate": 2.9436309291586294e-05, + "loss": 2.8481, + "step": 3969000 + }, + { + "epoch": 1.2339768747853093, + "grad_norm": 10.010831832885742, + "learning_rate": 2.943371875357818e-05, + "loss": 2.7814, + "step": 3969500 + }, + { + "epoch": 1.2341323070657961, + "grad_norm": 10.789239883422852, + "learning_rate": 2.9431128215570068e-05, + "loss": 2.8676, + "step": 3970000 + }, + { + "epoch": 1.234287739346283, + "grad_norm": 8.747909545898438, + "learning_rate": 2.9428537677561952e-05, + "loss": 2.8173, + "step": 3970500 + }, + { + "epoch": 1.2344431716267699, + "grad_norm": 25.662118911743164, + "learning_rate": 2.942594713955384e-05, + "loss": 2.7971, + "step": 3971000 + }, + { + "epoch": 1.2345986039072567, + "grad_norm": 9.307735443115234, + "learning_rate": 2.9423356601545726e-05, + "loss": 2.827, + "step": 3971500 + }, + { + "epoch": 1.2347540361877436, + "grad_norm": 15.424802780151367, + "learning_rate": 2.9420766063537607e-05, + "loss": 2.8517, + "step": 3972000 + }, + { + "epoch": 1.2349094684682305, + "grad_norm": 10.487433433532715, + "learning_rate": 2.9418175525529497e-05, + "loss": 2.8463, + "step": 3972500 + }, + { + "epoch": 1.2350649007487173, + "grad_norm": 15.164679527282715, + "learning_rate": 2.9415584987521377e-05, + "loss": 2.8447, + "step": 3973000 + }, + { + "epoch": 1.2352203330292042, + "grad_norm": 32.53236770629883, + "learning_rate": 2.9412994449513265e-05, + "loss": 2.8091, + "step": 3973500 + }, + { + "epoch": 1.235375765309691, + "grad_norm": 8.354549407958984, + "learning_rate": 2.9410403911505152e-05, + "loss": 2.8473, + "step": 3974000 + }, + { + "epoch": 1.235531197590178, + "grad_norm": 9.717977523803711, + "learning_rate": 2.9407813373497036e-05, + "loss": 2.8015, + "step": 3974500 + }, + { + "epoch": 1.2356866298706648, + "grad_norm": 9.12913990020752, + "learning_rate": 2.9405222835488923e-05, + "loss": 2.8517, + "step": 3975000 + }, + { + "epoch": 1.2358420621511517, + "grad_norm": 8.433611869812012, + "learning_rate": 2.940263229748081e-05, + "loss": 2.8345, + "step": 3975500 + }, + { + "epoch": 1.2359974944316385, + "grad_norm": 9.766407012939453, + "learning_rate": 2.940004175947269e-05, + "loss": 2.7958, + "step": 3976000 + }, + { + "epoch": 1.2361529267121254, + "grad_norm": 7.5537543296813965, + "learning_rate": 2.9397451221464577e-05, + "loss": 2.8389, + "step": 3976500 + }, + { + "epoch": 1.2363083589926123, + "grad_norm": 9.725394248962402, + "learning_rate": 2.9394860683456465e-05, + "loss": 2.8441, + "step": 3977000 + }, + { + "epoch": 1.2364637912730991, + "grad_norm": 10.965370178222656, + "learning_rate": 2.939227014544835e-05, + "loss": 2.7748, + "step": 3977500 + }, + { + "epoch": 1.236619223553586, + "grad_norm": 10.706387519836426, + "learning_rate": 2.9389679607440235e-05, + "loss": 2.8676, + "step": 3978000 + }, + { + "epoch": 1.2367746558340729, + "grad_norm": 7.974374771118164, + "learning_rate": 2.9387089069432116e-05, + "loss": 2.8145, + "step": 3978500 + }, + { + "epoch": 1.2369300881145597, + "grad_norm": 8.680874824523926, + "learning_rate": 2.9384498531424003e-05, + "loss": 2.8435, + "step": 3979000 + }, + { + "epoch": 1.2370855203950466, + "grad_norm": 9.774932861328125, + "learning_rate": 2.938190799341589e-05, + "loss": 2.8113, + "step": 3979500 + }, + { + "epoch": 1.2372409526755335, + "grad_norm": 9.73847770690918, + "learning_rate": 2.9379317455407774e-05, + "loss": 2.8518, + "step": 3980000 + }, + { + "epoch": 1.2373963849560203, + "grad_norm": 7.611592769622803, + "learning_rate": 2.937672691739966e-05, + "loss": 2.8287, + "step": 3980500 + }, + { + "epoch": 1.2375518172365072, + "grad_norm": 11.729168891906738, + "learning_rate": 2.9374136379391548e-05, + "loss": 2.8363, + "step": 3981000 + }, + { + "epoch": 1.237707249516994, + "grad_norm": 8.002240180969238, + "learning_rate": 2.937154584138343e-05, + "loss": 2.8404, + "step": 3981500 + }, + { + "epoch": 1.2378626817974812, + "grad_norm": 7.904122829437256, + "learning_rate": 2.9368955303375316e-05, + "loss": 2.8849, + "step": 3982000 + }, + { + "epoch": 1.238018114077968, + "grad_norm": 8.657675743103027, + "learning_rate": 2.9366364765367206e-05, + "loss": 2.8624, + "step": 3982500 + }, + { + "epoch": 1.238173546358455, + "grad_norm": 57.1417121887207, + "learning_rate": 2.9363774227359087e-05, + "loss": 2.8366, + "step": 3983000 + }, + { + "epoch": 1.2383289786389418, + "grad_norm": 8.88260269165039, + "learning_rate": 2.9361183689350974e-05, + "loss": 2.8658, + "step": 3983500 + }, + { + "epoch": 1.2384844109194286, + "grad_norm": 10.20126724243164, + "learning_rate": 2.935859315134286e-05, + "loss": 2.8278, + "step": 3984000 + }, + { + "epoch": 1.2386398431999155, + "grad_norm": 11.082656860351562, + "learning_rate": 2.9356002613334745e-05, + "loss": 2.784, + "step": 3984500 + }, + { + "epoch": 1.2387952754804024, + "grad_norm": 13.875201225280762, + "learning_rate": 2.9353412075326632e-05, + "loss": 2.8537, + "step": 3985000 + }, + { + "epoch": 1.2389507077608892, + "grad_norm": 7.801939010620117, + "learning_rate": 2.9350821537318512e-05, + "loss": 2.8296, + "step": 3985500 + }, + { + "epoch": 1.239106140041376, + "grad_norm": 20.82878875732422, + "learning_rate": 2.93482309993104e-05, + "loss": 2.9137, + "step": 3986000 + }, + { + "epoch": 1.239261572321863, + "grad_norm": 8.765009880065918, + "learning_rate": 2.9345640461302287e-05, + "loss": 2.8623, + "step": 3986500 + }, + { + "epoch": 1.2394170046023498, + "grad_norm": 8.146303176879883, + "learning_rate": 2.934304992329417e-05, + "loss": 2.8396, + "step": 3987000 + }, + { + "epoch": 1.2395724368828367, + "grad_norm": 8.396001815795898, + "learning_rate": 2.9340459385286058e-05, + "loss": 2.8033, + "step": 3987500 + }, + { + "epoch": 1.2397278691633236, + "grad_norm": 9.230238914489746, + "learning_rate": 2.9337868847277945e-05, + "loss": 2.8557, + "step": 3988000 + }, + { + "epoch": 1.2398833014438104, + "grad_norm": 8.301993370056152, + "learning_rate": 2.9335278309269825e-05, + "loss": 2.8085, + "step": 3988500 + }, + { + "epoch": 1.2400387337242973, + "grad_norm": 8.719982147216797, + "learning_rate": 2.9332687771261712e-05, + "loss": 2.866, + "step": 3989000 + }, + { + "epoch": 1.2401941660047842, + "grad_norm": 10.012482643127441, + "learning_rate": 2.93300972332536e-05, + "loss": 2.8723, + "step": 3989500 + }, + { + "epoch": 1.240349598285271, + "grad_norm": 8.441944122314453, + "learning_rate": 2.9327506695245483e-05, + "loss": 2.8546, + "step": 3990000 + }, + { + "epoch": 1.240505030565758, + "grad_norm": 13.091477394104004, + "learning_rate": 2.932491615723737e-05, + "loss": 2.7894, + "step": 3990500 + }, + { + "epoch": 1.2406604628462448, + "grad_norm": 10.076692581176758, + "learning_rate": 2.932232561922925e-05, + "loss": 2.8621, + "step": 3991000 + }, + { + "epoch": 1.2408158951267316, + "grad_norm": 7.655643939971924, + "learning_rate": 2.9319735081221138e-05, + "loss": 2.8303, + "step": 3991500 + }, + { + "epoch": 1.2409713274072185, + "grad_norm": 8.487726211547852, + "learning_rate": 2.9317144543213025e-05, + "loss": 2.8206, + "step": 3992000 + }, + { + "epoch": 1.2411267596877056, + "grad_norm": 9.09811782836914, + "learning_rate": 2.931455400520491e-05, + "loss": 2.8345, + "step": 3992500 + }, + { + "epoch": 1.2412821919681924, + "grad_norm": 9.14154052734375, + "learning_rate": 2.9311963467196796e-05, + "loss": 2.8092, + "step": 3993000 + }, + { + "epoch": 1.2414376242486793, + "grad_norm": 10.015750885009766, + "learning_rate": 2.9309372929188683e-05, + "loss": 2.8495, + "step": 3993500 + }, + { + "epoch": 1.2415930565291662, + "grad_norm": 8.285758018493652, + "learning_rate": 2.9306782391180563e-05, + "loss": 2.808, + "step": 3994000 + }, + { + "epoch": 1.241748488809653, + "grad_norm": 8.117934226989746, + "learning_rate": 2.9304191853172454e-05, + "loss": 2.8193, + "step": 3994500 + }, + { + "epoch": 1.24190392109014, + "grad_norm": 8.990256309509277, + "learning_rate": 2.930160131516434e-05, + "loss": 2.8311, + "step": 3995000 + }, + { + "epoch": 1.2420593533706268, + "grad_norm": 9.177356719970703, + "learning_rate": 2.929901077715622e-05, + "loss": 2.7939, + "step": 3995500 + }, + { + "epoch": 1.2422147856511137, + "grad_norm": 11.20091724395752, + "learning_rate": 2.929642023914811e-05, + "loss": 2.8602, + "step": 3996000 + }, + { + "epoch": 1.2423702179316005, + "grad_norm": 8.634817123413086, + "learning_rate": 2.9293829701139992e-05, + "loss": 2.854, + "step": 3996500 + }, + { + "epoch": 1.2425256502120874, + "grad_norm": 8.067070007324219, + "learning_rate": 2.929123916313188e-05, + "loss": 2.8212, + "step": 3997000 + }, + { + "epoch": 1.2426810824925743, + "grad_norm": 8.766178131103516, + "learning_rate": 2.9288648625123767e-05, + "loss": 2.8404, + "step": 3997500 + }, + { + "epoch": 1.2428365147730611, + "grad_norm": 21.044788360595703, + "learning_rate": 2.9286058087115647e-05, + "loss": 2.8055, + "step": 3998000 + }, + { + "epoch": 1.242991947053548, + "grad_norm": 9.330278396606445, + "learning_rate": 2.9283467549107534e-05, + "loss": 2.7859, + "step": 3998500 + }, + { + "epoch": 1.2431473793340349, + "grad_norm": 10.624930381774902, + "learning_rate": 2.928087701109942e-05, + "loss": 2.8418, + "step": 3999000 + }, + { + "epoch": 1.2433028116145217, + "grad_norm": 7.85241174697876, + "learning_rate": 2.9278286473091305e-05, + "loss": 2.868, + "step": 3999500 + }, + { + "epoch": 1.2434582438950086, + "grad_norm": 14.822848320007324, + "learning_rate": 2.9275695935083192e-05, + "loss": 2.8368, + "step": 4000000 + }, + { + "epoch": 1.2436136761754955, + "grad_norm": 8.958559036254883, + "learning_rate": 2.927310539707508e-05, + "loss": 2.8811, + "step": 4000500 + }, + { + "epoch": 1.2437691084559823, + "grad_norm": 10.309440612792969, + "learning_rate": 2.927051485906696e-05, + "loss": 2.845, + "step": 4001000 + }, + { + "epoch": 1.2439245407364692, + "grad_norm": 8.676634788513184, + "learning_rate": 2.9267924321058847e-05, + "loss": 2.8768, + "step": 4001500 + }, + { + "epoch": 1.244079973016956, + "grad_norm": 6.513299465179443, + "learning_rate": 2.9265333783050734e-05, + "loss": 2.8247, + "step": 4002000 + }, + { + "epoch": 1.244235405297443, + "grad_norm": 8.844549179077148, + "learning_rate": 2.9262743245042618e-05, + "loss": 2.8499, + "step": 4002500 + }, + { + "epoch": 1.2443908375779298, + "grad_norm": 7.762399673461914, + "learning_rate": 2.9260152707034505e-05, + "loss": 2.8237, + "step": 4003000 + }, + { + "epoch": 1.2445462698584167, + "grad_norm": 8.65451431274414, + "learning_rate": 2.9257562169026385e-05, + "loss": 2.8329, + "step": 4003500 + }, + { + "epoch": 1.2447017021389035, + "grad_norm": 9.372855186462402, + "learning_rate": 2.9254971631018273e-05, + "loss": 2.8297, + "step": 4004000 + }, + { + "epoch": 1.2448571344193904, + "grad_norm": 9.604012489318848, + "learning_rate": 2.9252381093010163e-05, + "loss": 2.8132, + "step": 4004500 + }, + { + "epoch": 1.2450125666998773, + "grad_norm": 29.226327896118164, + "learning_rate": 2.9249790555002043e-05, + "loss": 2.8284, + "step": 4005000 + }, + { + "epoch": 1.2451679989803641, + "grad_norm": 10.089460372924805, + "learning_rate": 2.924720001699393e-05, + "loss": 2.8326, + "step": 4005500 + }, + { + "epoch": 1.2453234312608512, + "grad_norm": 10.124815940856934, + "learning_rate": 2.9244609478985818e-05, + "loss": 2.8322, + "step": 4006000 + }, + { + "epoch": 1.245478863541338, + "grad_norm": 12.324067115783691, + "learning_rate": 2.92420189409777e-05, + "loss": 2.8266, + "step": 4006500 + }, + { + "epoch": 1.245634295821825, + "grad_norm": 10.282017707824707, + "learning_rate": 2.923942840296959e-05, + "loss": 2.8582, + "step": 4007000 + }, + { + "epoch": 1.2457897281023118, + "grad_norm": 18.05136489868164, + "learning_rate": 2.9236837864961476e-05, + "loss": 2.8236, + "step": 4007500 + }, + { + "epoch": 1.2459451603827987, + "grad_norm": 22.844280242919922, + "learning_rate": 2.9234247326953356e-05, + "loss": 2.8284, + "step": 4008000 + }, + { + "epoch": 1.2461005926632855, + "grad_norm": 8.779093742370605, + "learning_rate": 2.9231656788945243e-05, + "loss": 2.8437, + "step": 4008500 + }, + { + "epoch": 1.2462560249437724, + "grad_norm": 9.531928062438965, + "learning_rate": 2.9229066250937127e-05, + "loss": 2.8631, + "step": 4009000 + }, + { + "epoch": 1.2464114572242593, + "grad_norm": 8.350166320800781, + "learning_rate": 2.9226475712929014e-05, + "loss": 2.8353, + "step": 4009500 + }, + { + "epoch": 1.2465668895047461, + "grad_norm": 7.615129470825195, + "learning_rate": 2.92238851749209e-05, + "loss": 2.7778, + "step": 4010000 + }, + { + "epoch": 1.246722321785233, + "grad_norm": 9.624025344848633, + "learning_rate": 2.9221294636912782e-05, + "loss": 2.8388, + "step": 4010500 + }, + { + "epoch": 1.2468777540657199, + "grad_norm": 10.116622924804688, + "learning_rate": 2.921870409890467e-05, + "loss": 2.8025, + "step": 4011000 + }, + { + "epoch": 1.2470331863462067, + "grad_norm": 7.301684379577637, + "learning_rate": 2.9216113560896556e-05, + "loss": 2.8558, + "step": 4011500 + }, + { + "epoch": 1.2471886186266936, + "grad_norm": 12.368507385253906, + "learning_rate": 2.921352302288844e-05, + "loss": 2.8348, + "step": 4012000 + }, + { + "epoch": 1.2473440509071805, + "grad_norm": 8.842255592346191, + "learning_rate": 2.9210932484880327e-05, + "loss": 2.8131, + "step": 4012500 + }, + { + "epoch": 1.2474994831876673, + "grad_norm": 13.656980514526367, + "learning_rate": 2.9208341946872214e-05, + "loss": 2.8384, + "step": 4013000 + }, + { + "epoch": 1.2476549154681542, + "grad_norm": 10.787422180175781, + "learning_rate": 2.9205751408864095e-05, + "loss": 2.8527, + "step": 4013500 + }, + { + "epoch": 1.247810347748641, + "grad_norm": 6.852137088775635, + "learning_rate": 2.9203160870855982e-05, + "loss": 2.8336, + "step": 4014000 + }, + { + "epoch": 1.247965780029128, + "grad_norm": 9.462738990783691, + "learning_rate": 2.9200570332847866e-05, + "loss": 2.8397, + "step": 4014500 + }, + { + "epoch": 1.2481212123096148, + "grad_norm": 8.790613174438477, + "learning_rate": 2.9197979794839753e-05, + "loss": 2.8521, + "step": 4015000 + }, + { + "epoch": 1.2482766445901017, + "grad_norm": 7.022427558898926, + "learning_rate": 2.919538925683164e-05, + "loss": 2.8382, + "step": 4015500 + }, + { + "epoch": 1.2484320768705885, + "grad_norm": 8.611903190612793, + "learning_rate": 2.9192798718823524e-05, + "loss": 2.8501, + "step": 4016000 + }, + { + "epoch": 1.2485875091510756, + "grad_norm": 9.815628051757812, + "learning_rate": 2.919020818081541e-05, + "loss": 2.8193, + "step": 4016500 + }, + { + "epoch": 1.2487429414315625, + "grad_norm": 9.278103828430176, + "learning_rate": 2.9187617642807298e-05, + "loss": 2.8115, + "step": 4017000 + }, + { + "epoch": 1.2488983737120494, + "grad_norm": 10.338245391845703, + "learning_rate": 2.9185027104799178e-05, + "loss": 2.8195, + "step": 4017500 + }, + { + "epoch": 1.2490538059925362, + "grad_norm": 9.071369171142578, + "learning_rate": 2.9182436566791065e-05, + "loss": 2.8377, + "step": 4018000 + }, + { + "epoch": 1.249209238273023, + "grad_norm": 9.456426620483398, + "learning_rate": 2.9179846028782953e-05, + "loss": 2.8733, + "step": 4018500 + }, + { + "epoch": 1.24936467055351, + "grad_norm": 16.630029678344727, + "learning_rate": 2.9177255490774836e-05, + "loss": 2.8326, + "step": 4019000 + }, + { + "epoch": 1.2495201028339968, + "grad_norm": 13.952574729919434, + "learning_rate": 2.9174664952766724e-05, + "loss": 2.8029, + "step": 4019500 + }, + { + "epoch": 1.2496755351144837, + "grad_norm": 8.048800468444824, + "learning_rate": 2.917207441475861e-05, + "loss": 2.8073, + "step": 4020000 + }, + { + "epoch": 1.2498309673949706, + "grad_norm": 13.727659225463867, + "learning_rate": 2.916948387675049e-05, + "loss": 2.8328, + "step": 4020500 + }, + { + "epoch": 1.2499863996754574, + "grad_norm": 9.523242950439453, + "learning_rate": 2.9166893338742378e-05, + "loss": 2.8354, + "step": 4021000 + }, + { + "epoch": 1.2501418319559443, + "grad_norm": 11.807839393615723, + "learning_rate": 2.9164302800734262e-05, + "loss": 2.838, + "step": 4021500 + }, + { + "epoch": 1.2502972642364312, + "grad_norm": 17.041439056396484, + "learning_rate": 2.916171226272615e-05, + "loss": 2.8823, + "step": 4022000 + }, + { + "epoch": 1.250452696516918, + "grad_norm": 8.388256072998047, + "learning_rate": 2.9159121724718036e-05, + "loss": 2.8048, + "step": 4022500 + }, + { + "epoch": 1.250608128797405, + "grad_norm": 9.687066078186035, + "learning_rate": 2.9156531186709917e-05, + "loss": 2.8089, + "step": 4023000 + }, + { + "epoch": 1.2507635610778918, + "grad_norm": 8.965018272399902, + "learning_rate": 2.9153940648701804e-05, + "loss": 2.7964, + "step": 4023500 + }, + { + "epoch": 1.2509189933583786, + "grad_norm": 12.643733978271484, + "learning_rate": 2.915135011069369e-05, + "loss": 2.7916, + "step": 4024000 + }, + { + "epoch": 1.2510744256388655, + "grad_norm": 8.068138122558594, + "learning_rate": 2.9148759572685575e-05, + "loss": 2.8528, + "step": 4024500 + }, + { + "epoch": 1.2512298579193524, + "grad_norm": 8.971295356750488, + "learning_rate": 2.9146169034677462e-05, + "loss": 2.8837, + "step": 4025000 + }, + { + "epoch": 1.2513852901998392, + "grad_norm": 11.229735374450684, + "learning_rate": 2.914357849666935e-05, + "loss": 2.8326, + "step": 4025500 + }, + { + "epoch": 1.251540722480326, + "grad_norm": 14.384347915649414, + "learning_rate": 2.9140987958661233e-05, + "loss": 2.838, + "step": 4026000 + }, + { + "epoch": 1.251696154760813, + "grad_norm": 9.846345901489258, + "learning_rate": 2.913839742065312e-05, + "loss": 2.7888, + "step": 4026500 + }, + { + "epoch": 1.2518515870412998, + "grad_norm": 11.29188060760498, + "learning_rate": 2.9135806882645e-05, + "loss": 2.8656, + "step": 4027000 + }, + { + "epoch": 1.2520070193217867, + "grad_norm": 8.295489311218262, + "learning_rate": 2.9133216344636887e-05, + "loss": 2.8257, + "step": 4027500 + }, + { + "epoch": 1.2521624516022736, + "grad_norm": 9.14037036895752, + "learning_rate": 2.9130625806628775e-05, + "loss": 2.8309, + "step": 4028000 + }, + { + "epoch": 1.2523178838827604, + "grad_norm": 11.318441390991211, + "learning_rate": 2.912803526862066e-05, + "loss": 2.8605, + "step": 4028500 + }, + { + "epoch": 1.2524733161632473, + "grad_norm": 8.460466384887695, + "learning_rate": 2.9125444730612546e-05, + "loss": 2.8134, + "step": 4029000 + }, + { + "epoch": 1.2526287484437342, + "grad_norm": 20.626447677612305, + "learning_rate": 2.9122854192604433e-05, + "loss": 2.8124, + "step": 4029500 + }, + { + "epoch": 1.252784180724221, + "grad_norm": 19.387727737426758, + "learning_rate": 2.9120263654596313e-05, + "loss": 2.823, + "step": 4030000 + }, + { + "epoch": 1.252939613004708, + "grad_norm": 19.38434600830078, + "learning_rate": 2.91176731165882e-05, + "loss": 2.8485, + "step": 4030500 + }, + { + "epoch": 1.253095045285195, + "grad_norm": 10.686087608337402, + "learning_rate": 2.9115082578580087e-05, + "loss": 2.8208, + "step": 4031000 + }, + { + "epoch": 1.2532504775656819, + "grad_norm": 10.140777587890625, + "learning_rate": 2.911249204057197e-05, + "loss": 2.7944, + "step": 4031500 + }, + { + "epoch": 1.2534059098461687, + "grad_norm": 9.9536771774292, + "learning_rate": 2.9109901502563858e-05, + "loss": 2.825, + "step": 4032000 + }, + { + "epoch": 1.2535613421266556, + "grad_norm": 8.443273544311523, + "learning_rate": 2.910731096455574e-05, + "loss": 2.8564, + "step": 4032500 + }, + { + "epoch": 1.2537167744071425, + "grad_norm": 9.479632377624512, + "learning_rate": 2.9104720426547626e-05, + "loss": 2.8044, + "step": 4033000 + }, + { + "epoch": 1.2538722066876293, + "grad_norm": 8.451627731323242, + "learning_rate": 2.9102129888539513e-05, + "loss": 2.8406, + "step": 4033500 + }, + { + "epoch": 1.2540276389681162, + "grad_norm": 8.418844223022461, + "learning_rate": 2.9099539350531397e-05, + "loss": 2.8312, + "step": 4034000 + }, + { + "epoch": 1.254183071248603, + "grad_norm": 7.40387487411499, + "learning_rate": 2.9096948812523284e-05, + "loss": 2.8051, + "step": 4034500 + }, + { + "epoch": 1.25433850352909, + "grad_norm": 10.178009986877441, + "learning_rate": 2.909435827451517e-05, + "loss": 2.7853, + "step": 4035000 + }, + { + "epoch": 1.2544939358095768, + "grad_norm": 8.971587181091309, + "learning_rate": 2.909176773650705e-05, + "loss": 2.8489, + "step": 4035500 + }, + { + "epoch": 1.2546493680900637, + "grad_norm": 12.96993350982666, + "learning_rate": 2.9089177198498942e-05, + "loss": 2.8582, + "step": 4036000 + }, + { + "epoch": 1.2548048003705505, + "grad_norm": 20.854616165161133, + "learning_rate": 2.908658666049083e-05, + "loss": 2.8518, + "step": 4036500 + }, + { + "epoch": 1.2549602326510374, + "grad_norm": 8.764952659606934, + "learning_rate": 2.908399612248271e-05, + "loss": 2.8419, + "step": 4037000 + }, + { + "epoch": 1.2551156649315243, + "grad_norm": 6.54217004776001, + "learning_rate": 2.9081405584474597e-05, + "loss": 2.8295, + "step": 4037500 + }, + { + "epoch": 1.2552710972120111, + "grad_norm": 7.3802690505981445, + "learning_rate": 2.9078815046466484e-05, + "loss": 2.8104, + "step": 4038000 + }, + { + "epoch": 1.255426529492498, + "grad_norm": 19.23668670654297, + "learning_rate": 2.9076224508458368e-05, + "loss": 2.8796, + "step": 4038500 + }, + { + "epoch": 1.2555819617729849, + "grad_norm": 8.572202682495117, + "learning_rate": 2.9073633970450255e-05, + "loss": 2.8378, + "step": 4039000 + }, + { + "epoch": 1.2557373940534717, + "grad_norm": 8.805363655090332, + "learning_rate": 2.9071043432442135e-05, + "loss": 2.8367, + "step": 4039500 + }, + { + "epoch": 1.2558928263339588, + "grad_norm": 10.390484809875488, + "learning_rate": 2.9068452894434022e-05, + "loss": 2.8246, + "step": 4040000 + }, + { + "epoch": 1.2560482586144457, + "grad_norm": 8.034857749938965, + "learning_rate": 2.906586235642591e-05, + "loss": 2.7811, + "step": 4040500 + }, + { + "epoch": 1.2562036908949326, + "grad_norm": 7.740722179412842, + "learning_rate": 2.9063271818417793e-05, + "loss": 2.8024, + "step": 4041000 + }, + { + "epoch": 1.2563591231754194, + "grad_norm": 8.010037422180176, + "learning_rate": 2.906068128040968e-05, + "loss": 2.8078, + "step": 4041500 + }, + { + "epoch": 1.2565145554559063, + "grad_norm": 9.228168487548828, + "learning_rate": 2.9058090742401567e-05, + "loss": 2.8311, + "step": 4042000 + }, + { + "epoch": 1.2566699877363932, + "grad_norm": 8.942778587341309, + "learning_rate": 2.9055500204393448e-05, + "loss": 2.8207, + "step": 4042500 + }, + { + "epoch": 1.25682542001688, + "grad_norm": 8.730801582336426, + "learning_rate": 2.9052909666385335e-05, + "loss": 2.8233, + "step": 4043000 + }, + { + "epoch": 1.256980852297367, + "grad_norm": 7.678500652313232, + "learning_rate": 2.9050319128377222e-05, + "loss": 2.8318, + "step": 4043500 + }, + { + "epoch": 1.2571362845778538, + "grad_norm": 9.26722526550293, + "learning_rate": 2.9047728590369106e-05, + "loss": 2.8761, + "step": 4044000 + }, + { + "epoch": 1.2572917168583406, + "grad_norm": 10.692769050598145, + "learning_rate": 2.9045138052360993e-05, + "loss": 2.8219, + "step": 4044500 + }, + { + "epoch": 1.2574471491388275, + "grad_norm": 7.387726783752441, + "learning_rate": 2.9042547514352873e-05, + "loss": 2.8102, + "step": 4045000 + }, + { + "epoch": 1.2576025814193144, + "grad_norm": 7.725583076477051, + "learning_rate": 2.903995697634476e-05, + "loss": 2.8249, + "step": 4045500 + }, + { + "epoch": 1.2577580136998012, + "grad_norm": 13.815380096435547, + "learning_rate": 2.903736643833665e-05, + "loss": 2.8155, + "step": 4046000 + }, + { + "epoch": 1.257913445980288, + "grad_norm": 8.156950950622559, + "learning_rate": 2.903477590032853e-05, + "loss": 2.8028, + "step": 4046500 + }, + { + "epoch": 1.258068878260775, + "grad_norm": 8.78976821899414, + "learning_rate": 2.903218536232042e-05, + "loss": 2.8403, + "step": 4047000 + }, + { + "epoch": 1.2582243105412618, + "grad_norm": 9.153359413146973, + "learning_rate": 2.9029594824312306e-05, + "loss": 2.8532, + "step": 4047500 + }, + { + "epoch": 1.2583797428217487, + "grad_norm": 8.40832233428955, + "learning_rate": 2.902700428630419e-05, + "loss": 2.8509, + "step": 4048000 + }, + { + "epoch": 1.2585351751022356, + "grad_norm": 8.345455169677734, + "learning_rate": 2.9024413748296077e-05, + "loss": 2.8321, + "step": 4048500 + }, + { + "epoch": 1.2586906073827224, + "grad_norm": 8.499229431152344, + "learning_rate": 2.9021823210287964e-05, + "loss": 2.8264, + "step": 4049000 + }, + { + "epoch": 1.2588460396632093, + "grad_norm": 13.057723045349121, + "learning_rate": 2.9019232672279844e-05, + "loss": 2.8315, + "step": 4049500 + }, + { + "epoch": 1.2590014719436962, + "grad_norm": 9.96527099609375, + "learning_rate": 2.901664213427173e-05, + "loss": 2.8716, + "step": 4050000 + }, + { + "epoch": 1.259156904224183, + "grad_norm": 7.992790699005127, + "learning_rate": 2.9014051596263615e-05, + "loss": 2.8504, + "step": 4050500 + }, + { + "epoch": 1.25931233650467, + "grad_norm": 18.873769760131836, + "learning_rate": 2.9011461058255502e-05, + "loss": 2.7886, + "step": 4051000 + }, + { + "epoch": 1.2594677687851568, + "grad_norm": 9.1380615234375, + "learning_rate": 2.900887052024739e-05, + "loss": 2.8, + "step": 4051500 + }, + { + "epoch": 1.2596232010656436, + "grad_norm": 18.681400299072266, + "learning_rate": 2.900627998223927e-05, + "loss": 2.8164, + "step": 4052000 + }, + { + "epoch": 1.2597786333461305, + "grad_norm": 13.836726188659668, + "learning_rate": 2.9003689444231157e-05, + "loss": 2.8489, + "step": 4052500 + }, + { + "epoch": 1.2599340656266174, + "grad_norm": 14.314898490905762, + "learning_rate": 2.9001098906223044e-05, + "loss": 2.779, + "step": 4053000 + }, + { + "epoch": 1.2600894979071042, + "grad_norm": 10.997724533081055, + "learning_rate": 2.8998508368214928e-05, + "loss": 2.8231, + "step": 4053500 + }, + { + "epoch": 1.260244930187591, + "grad_norm": 11.05537223815918, + "learning_rate": 2.8995917830206815e-05, + "loss": 2.8565, + "step": 4054000 + }, + { + "epoch": 1.260400362468078, + "grad_norm": 9.46123218536377, + "learning_rate": 2.8993327292198702e-05, + "loss": 2.8287, + "step": 4054500 + }, + { + "epoch": 1.260555794748565, + "grad_norm": 9.26337718963623, + "learning_rate": 2.8990736754190583e-05, + "loss": 2.8608, + "step": 4055000 + }, + { + "epoch": 1.260711227029052, + "grad_norm": 9.416229248046875, + "learning_rate": 2.898814621618247e-05, + "loss": 2.8381, + "step": 4055500 + }, + { + "epoch": 1.2608666593095388, + "grad_norm": 7.08278751373291, + "learning_rate": 2.898555567817436e-05, + "loss": 2.8416, + "step": 4056000 + }, + { + "epoch": 1.2610220915900257, + "grad_norm": 9.50455093383789, + "learning_rate": 2.898296514016624e-05, + "loss": 2.8314, + "step": 4056500 + }, + { + "epoch": 1.2611775238705125, + "grad_norm": 10.723848342895508, + "learning_rate": 2.8980374602158128e-05, + "loss": 2.8408, + "step": 4057000 + }, + { + "epoch": 1.2613329561509994, + "grad_norm": 17.294517517089844, + "learning_rate": 2.8977784064150008e-05, + "loss": 2.8404, + "step": 4057500 + }, + { + "epoch": 1.2614883884314863, + "grad_norm": 9.364919662475586, + "learning_rate": 2.89751935261419e-05, + "loss": 2.839, + "step": 4058000 + }, + { + "epoch": 1.2616438207119731, + "grad_norm": 18.648338317871094, + "learning_rate": 2.8972602988133786e-05, + "loss": 2.7811, + "step": 4058500 + }, + { + "epoch": 1.26179925299246, + "grad_norm": 9.522272109985352, + "learning_rate": 2.8970012450125666e-05, + "loss": 2.8167, + "step": 4059000 + }, + { + "epoch": 1.2619546852729469, + "grad_norm": 7.374168395996094, + "learning_rate": 2.8967421912117553e-05, + "loss": 2.7841, + "step": 4059500 + }, + { + "epoch": 1.2621101175534337, + "grad_norm": 9.134795188903809, + "learning_rate": 2.896483137410944e-05, + "loss": 2.7931, + "step": 4060000 + }, + { + "epoch": 1.2622655498339206, + "grad_norm": 12.489049911499023, + "learning_rate": 2.8962240836101324e-05, + "loss": 2.7939, + "step": 4060500 + }, + { + "epoch": 1.2624209821144075, + "grad_norm": 8.481751441955566, + "learning_rate": 2.895965029809321e-05, + "loss": 2.8454, + "step": 4061000 + }, + { + "epoch": 1.2625764143948943, + "grad_norm": 8.661394119262695, + "learning_rate": 2.89570597600851e-05, + "loss": 2.8075, + "step": 4061500 + }, + { + "epoch": 1.2627318466753812, + "grad_norm": 13.011517524719238, + "learning_rate": 2.895446922207698e-05, + "loss": 2.8403, + "step": 4062000 + }, + { + "epoch": 1.262887278955868, + "grad_norm": 9.714963912963867, + "learning_rate": 2.8951878684068866e-05, + "loss": 2.8137, + "step": 4062500 + }, + { + "epoch": 1.263042711236355, + "grad_norm": 7.725822925567627, + "learning_rate": 2.894928814606075e-05, + "loss": 2.8264, + "step": 4063000 + }, + { + "epoch": 1.2631981435168418, + "grad_norm": 11.323038101196289, + "learning_rate": 2.8946697608052637e-05, + "loss": 2.7955, + "step": 4063500 + }, + { + "epoch": 1.2633535757973289, + "grad_norm": 9.654956817626953, + "learning_rate": 2.8944107070044524e-05, + "loss": 2.7792, + "step": 4064000 + }, + { + "epoch": 1.2635090080778157, + "grad_norm": 9.00074577331543, + "learning_rate": 2.8941516532036405e-05, + "loss": 2.7944, + "step": 4064500 + }, + { + "epoch": 1.2636644403583026, + "grad_norm": 10.746560096740723, + "learning_rate": 2.8938925994028292e-05, + "loss": 2.8226, + "step": 4065000 + }, + { + "epoch": 1.2638198726387895, + "grad_norm": 13.265795707702637, + "learning_rate": 2.893633545602018e-05, + "loss": 2.788, + "step": 4065500 + }, + { + "epoch": 1.2639753049192763, + "grad_norm": 9.068229675292969, + "learning_rate": 2.8933744918012063e-05, + "loss": 2.8405, + "step": 4066000 + }, + { + "epoch": 1.2641307371997632, + "grad_norm": 9.084473609924316, + "learning_rate": 2.893115438000395e-05, + "loss": 2.8138, + "step": 4066500 + }, + { + "epoch": 1.26428616948025, + "grad_norm": 8.7872314453125, + "learning_rate": 2.8928563841995837e-05, + "loss": 2.8334, + "step": 4067000 + }, + { + "epoch": 1.264441601760737, + "grad_norm": 9.104060173034668, + "learning_rate": 2.8925973303987717e-05, + "loss": 2.8502, + "step": 4067500 + }, + { + "epoch": 1.2645970340412238, + "grad_norm": 10.423410415649414, + "learning_rate": 2.8923382765979608e-05, + "loss": 2.8607, + "step": 4068000 + }, + { + "epoch": 1.2647524663217107, + "grad_norm": 17.440935134887695, + "learning_rate": 2.892079222797149e-05, + "loss": 2.8477, + "step": 4068500 + }, + { + "epoch": 1.2649078986021975, + "grad_norm": 8.119277000427246, + "learning_rate": 2.8918201689963376e-05, + "loss": 2.7555, + "step": 4069000 + }, + { + "epoch": 1.2650633308826844, + "grad_norm": 11.617997169494629, + "learning_rate": 2.8915611151955263e-05, + "loss": 2.8618, + "step": 4069500 + }, + { + "epoch": 1.2652187631631713, + "grad_norm": 8.837424278259277, + "learning_rate": 2.8913020613947146e-05, + "loss": 2.8565, + "step": 4070000 + }, + { + "epoch": 1.2653741954436581, + "grad_norm": 8.962926864624023, + "learning_rate": 2.8910430075939034e-05, + "loss": 2.7966, + "step": 4070500 + }, + { + "epoch": 1.265529627724145, + "grad_norm": 9.163000106811523, + "learning_rate": 2.890783953793092e-05, + "loss": 2.7883, + "step": 4071000 + }, + { + "epoch": 1.2656850600046319, + "grad_norm": 8.157411575317383, + "learning_rate": 2.89052489999228e-05, + "loss": 2.8155, + "step": 4071500 + }, + { + "epoch": 1.2658404922851187, + "grad_norm": 12.108642578125, + "learning_rate": 2.8902658461914688e-05, + "loss": 2.8233, + "step": 4072000 + }, + { + "epoch": 1.2659959245656056, + "grad_norm": 10.666576385498047, + "learning_rate": 2.8900067923906575e-05, + "loss": 2.8275, + "step": 4072500 + }, + { + "epoch": 1.2661513568460925, + "grad_norm": 7.967243671417236, + "learning_rate": 2.889747738589846e-05, + "loss": 2.8182, + "step": 4073000 + }, + { + "epoch": 1.2663067891265793, + "grad_norm": 11.522161483764648, + "learning_rate": 2.8894886847890346e-05, + "loss": 2.8184, + "step": 4073500 + }, + { + "epoch": 1.2664622214070662, + "grad_norm": 7.947514533996582, + "learning_rate": 2.8892296309882233e-05, + "loss": 2.8401, + "step": 4074000 + }, + { + "epoch": 1.266617653687553, + "grad_norm": 12.782171249389648, + "learning_rate": 2.8889705771874114e-05, + "loss": 2.8607, + "step": 4074500 + }, + { + "epoch": 1.26677308596804, + "grad_norm": 5.240906238555908, + "learning_rate": 2.8887115233866e-05, + "loss": 2.8283, + "step": 4075000 + }, + { + "epoch": 1.2669285182485268, + "grad_norm": 13.039813995361328, + "learning_rate": 2.8884524695857885e-05, + "loss": 2.8256, + "step": 4075500 + }, + { + "epoch": 1.2670839505290137, + "grad_norm": 7.441792011260986, + "learning_rate": 2.8881934157849772e-05, + "loss": 2.8068, + "step": 4076000 + }, + { + "epoch": 1.2672393828095005, + "grad_norm": 8.9617280960083, + "learning_rate": 2.887934361984166e-05, + "loss": 2.8272, + "step": 4076500 + }, + { + "epoch": 1.2673948150899874, + "grad_norm": 7.892889022827148, + "learning_rate": 2.887675308183354e-05, + "loss": 2.8609, + "step": 4077000 + }, + { + "epoch": 1.2675502473704743, + "grad_norm": 7.376807689666748, + "learning_rate": 2.8874162543825427e-05, + "loss": 2.8714, + "step": 4077500 + }, + { + "epoch": 1.2677056796509611, + "grad_norm": 9.877724647521973, + "learning_rate": 2.8871572005817317e-05, + "loss": 2.7878, + "step": 4078000 + }, + { + "epoch": 1.267861111931448, + "grad_norm": 10.43759536743164, + "learning_rate": 2.8868981467809198e-05, + "loss": 2.8045, + "step": 4078500 + }, + { + "epoch": 1.268016544211935, + "grad_norm": 10.215715408325195, + "learning_rate": 2.8866390929801085e-05, + "loss": 2.8598, + "step": 4079000 + }, + { + "epoch": 1.268171976492422, + "grad_norm": 11.248668670654297, + "learning_rate": 2.8863800391792972e-05, + "loss": 2.818, + "step": 4079500 + }, + { + "epoch": 1.2683274087729088, + "grad_norm": 8.957988739013672, + "learning_rate": 2.8861209853784856e-05, + "loss": 2.82, + "step": 4080000 + }, + { + "epoch": 1.2684828410533957, + "grad_norm": 7.035200595855713, + "learning_rate": 2.8858619315776743e-05, + "loss": 2.8573, + "step": 4080500 + }, + { + "epoch": 1.2686382733338826, + "grad_norm": 9.813728332519531, + "learning_rate": 2.8856028777768623e-05, + "loss": 2.8055, + "step": 4081000 + }, + { + "epoch": 1.2687937056143694, + "grad_norm": 10.255402565002441, + "learning_rate": 2.885343823976051e-05, + "loss": 2.8091, + "step": 4081500 + }, + { + "epoch": 1.2689491378948563, + "grad_norm": 10.138693809509277, + "learning_rate": 2.8850847701752397e-05, + "loss": 2.8231, + "step": 4082000 + }, + { + "epoch": 1.2691045701753432, + "grad_norm": 8.905945777893066, + "learning_rate": 2.884825716374428e-05, + "loss": 2.8594, + "step": 4082500 + }, + { + "epoch": 1.26926000245583, + "grad_norm": 9.324920654296875, + "learning_rate": 2.884566662573617e-05, + "loss": 2.799, + "step": 4083000 + }, + { + "epoch": 1.269415434736317, + "grad_norm": 10.97515869140625, + "learning_rate": 2.8843076087728056e-05, + "loss": 2.8118, + "step": 4083500 + }, + { + "epoch": 1.2695708670168038, + "grad_norm": 8.595675468444824, + "learning_rate": 2.8840485549719936e-05, + "loss": 2.8008, + "step": 4084000 + }, + { + "epoch": 1.2697262992972906, + "grad_norm": 8.306610107421875, + "learning_rate": 2.8837895011711823e-05, + "loss": 2.7981, + "step": 4084500 + }, + { + "epoch": 1.2698817315777775, + "grad_norm": 10.202409744262695, + "learning_rate": 2.883530447370371e-05, + "loss": 2.7997, + "step": 4085000 + }, + { + "epoch": 1.2700371638582644, + "grad_norm": 11.154422760009766, + "learning_rate": 2.8832713935695594e-05, + "loss": 2.7567, + "step": 4085500 + }, + { + "epoch": 1.2701925961387512, + "grad_norm": 10.511754035949707, + "learning_rate": 2.883012339768748e-05, + "loss": 2.8285, + "step": 4086000 + }, + { + "epoch": 1.270348028419238, + "grad_norm": 9.691906929016113, + "learning_rate": 2.882753285967936e-05, + "loss": 2.8158, + "step": 4086500 + }, + { + "epoch": 1.270503460699725, + "grad_norm": 6.9626359939575195, + "learning_rate": 2.882494232167125e-05, + "loss": 2.8708, + "step": 4087000 + }, + { + "epoch": 1.2706588929802118, + "grad_norm": 9.043818473815918, + "learning_rate": 2.8822351783663136e-05, + "loss": 2.8055, + "step": 4087500 + }, + { + "epoch": 1.270814325260699, + "grad_norm": 8.66968822479248, + "learning_rate": 2.881976124565502e-05, + "loss": 2.8463, + "step": 4088000 + }, + { + "epoch": 1.2709697575411858, + "grad_norm": 16.185935974121094, + "learning_rate": 2.8817170707646907e-05, + "loss": 2.8098, + "step": 4088500 + }, + { + "epoch": 1.2711251898216727, + "grad_norm": 8.223402976989746, + "learning_rate": 2.8814580169638794e-05, + "loss": 2.8432, + "step": 4089000 + }, + { + "epoch": 1.2712806221021595, + "grad_norm": 9.746476173400879, + "learning_rate": 2.8811989631630674e-05, + "loss": 2.812, + "step": 4089500 + }, + { + "epoch": 1.2714360543826464, + "grad_norm": 9.159024238586426, + "learning_rate": 2.8809399093622565e-05, + "loss": 2.8197, + "step": 4090000 + }, + { + "epoch": 1.2715914866631333, + "grad_norm": 8.449442863464355, + "learning_rate": 2.8806808555614452e-05, + "loss": 2.8487, + "step": 4090500 + }, + { + "epoch": 1.2717469189436201, + "grad_norm": 6.698483943939209, + "learning_rate": 2.8804218017606332e-05, + "loss": 2.8681, + "step": 4091000 + }, + { + "epoch": 1.271902351224107, + "grad_norm": 17.70322036743164, + "learning_rate": 2.880162747959822e-05, + "loss": 2.8408, + "step": 4091500 + }, + { + "epoch": 1.2720577835045939, + "grad_norm": 56.10185241699219, + "learning_rate": 2.8799036941590107e-05, + "loss": 2.7941, + "step": 4092000 + }, + { + "epoch": 1.2722132157850807, + "grad_norm": 10.723709106445312, + "learning_rate": 2.879644640358199e-05, + "loss": 2.7913, + "step": 4092500 + }, + { + "epoch": 1.2723686480655676, + "grad_norm": 8.875445365905762, + "learning_rate": 2.8793855865573878e-05, + "loss": 2.8252, + "step": 4093000 + }, + { + "epoch": 1.2725240803460545, + "grad_norm": 10.08558177947998, + "learning_rate": 2.8791265327565758e-05, + "loss": 2.8546, + "step": 4093500 + }, + { + "epoch": 1.2726795126265413, + "grad_norm": 18.190969467163086, + "learning_rate": 2.8788674789557645e-05, + "loss": 2.8235, + "step": 4094000 + }, + { + "epoch": 1.2728349449070282, + "grad_norm": 10.761018753051758, + "learning_rate": 2.8786084251549532e-05, + "loss": 2.8323, + "step": 4094500 + }, + { + "epoch": 1.272990377187515, + "grad_norm": 10.247227668762207, + "learning_rate": 2.8783493713541416e-05, + "loss": 2.8127, + "step": 4095000 + }, + { + "epoch": 1.273145809468002, + "grad_norm": 8.948396682739258, + "learning_rate": 2.8780903175533303e-05, + "loss": 2.8421, + "step": 4095500 + }, + { + "epoch": 1.2733012417484888, + "grad_norm": 13.714016914367676, + "learning_rate": 2.877831263752519e-05, + "loss": 2.8373, + "step": 4096000 + }, + { + "epoch": 1.2734566740289757, + "grad_norm": 13.524951934814453, + "learning_rate": 2.877572209951707e-05, + "loss": 2.8522, + "step": 4096500 + }, + { + "epoch": 1.2736121063094625, + "grad_norm": 8.303257942199707, + "learning_rate": 2.8773131561508958e-05, + "loss": 2.8292, + "step": 4097000 + }, + { + "epoch": 1.2737675385899494, + "grad_norm": 10.532401084899902, + "learning_rate": 2.8770541023500845e-05, + "loss": 2.8189, + "step": 4097500 + }, + { + "epoch": 1.2739229708704363, + "grad_norm": 11.913182258605957, + "learning_rate": 2.876795048549273e-05, + "loss": 2.844, + "step": 4098000 + }, + { + "epoch": 1.2740784031509231, + "grad_norm": 12.412446022033691, + "learning_rate": 2.8765359947484616e-05, + "loss": 2.823, + "step": 4098500 + }, + { + "epoch": 1.27423383543141, + "grad_norm": 10.090777397155762, + "learning_rate": 2.8762769409476496e-05, + "loss": 2.7728, + "step": 4099000 + }, + { + "epoch": 1.2743892677118969, + "grad_norm": 11.818242073059082, + "learning_rate": 2.8760178871468383e-05, + "loss": 2.8472, + "step": 4099500 + }, + { + "epoch": 1.2745446999923837, + "grad_norm": 8.184767723083496, + "learning_rate": 2.8757588333460274e-05, + "loss": 2.8095, + "step": 4100000 + }, + { + "epoch": 1.2747001322728706, + "grad_norm": 10.772643089294434, + "learning_rate": 2.8754997795452154e-05, + "loss": 2.7991, + "step": 4100500 + }, + { + "epoch": 1.2748555645533575, + "grad_norm": 9.515210151672363, + "learning_rate": 2.875240725744404e-05, + "loss": 2.8505, + "step": 4101000 + }, + { + "epoch": 1.2750109968338443, + "grad_norm": 10.438654899597168, + "learning_rate": 2.874981671943593e-05, + "loss": 2.8396, + "step": 4101500 + }, + { + "epoch": 1.2751664291143312, + "grad_norm": 12.317203521728516, + "learning_rate": 2.8747226181427812e-05, + "loss": 2.8264, + "step": 4102000 + }, + { + "epoch": 1.275321861394818, + "grad_norm": 11.395676612854004, + "learning_rate": 2.87446356434197e-05, + "loss": 2.8287, + "step": 4102500 + }, + { + "epoch": 1.2754772936753052, + "grad_norm": 8.854578971862793, + "learning_rate": 2.8742045105411587e-05, + "loss": 2.8365, + "step": 4103000 + }, + { + "epoch": 1.275632725955792, + "grad_norm": 8.065942764282227, + "learning_rate": 2.8739454567403467e-05, + "loss": 2.8043, + "step": 4103500 + }, + { + "epoch": 1.275788158236279, + "grad_norm": 8.952411651611328, + "learning_rate": 2.8736864029395354e-05, + "loss": 2.8145, + "step": 4104000 + }, + { + "epoch": 1.2759435905167658, + "grad_norm": 22.617549896240234, + "learning_rate": 2.8734273491387238e-05, + "loss": 2.8487, + "step": 4104500 + }, + { + "epoch": 1.2760990227972526, + "grad_norm": 9.911252975463867, + "learning_rate": 2.8731682953379125e-05, + "loss": 2.8012, + "step": 4105000 + }, + { + "epoch": 1.2762544550777395, + "grad_norm": 8.47885513305664, + "learning_rate": 2.8729092415371012e-05, + "loss": 2.8245, + "step": 4105500 + }, + { + "epoch": 1.2764098873582264, + "grad_norm": 16.62550926208496, + "learning_rate": 2.8726501877362893e-05, + "loss": 2.8294, + "step": 4106000 + }, + { + "epoch": 1.2765653196387132, + "grad_norm": 14.747027397155762, + "learning_rate": 2.872391133935478e-05, + "loss": 2.8609, + "step": 4106500 + }, + { + "epoch": 1.2767207519192, + "grad_norm": 12.410935401916504, + "learning_rate": 2.8721320801346667e-05, + "loss": 2.8343, + "step": 4107000 + }, + { + "epoch": 1.276876184199687, + "grad_norm": 9.530906677246094, + "learning_rate": 2.871873026333855e-05, + "loss": 2.8061, + "step": 4107500 + }, + { + "epoch": 1.2770316164801738, + "grad_norm": 10.297438621520996, + "learning_rate": 2.8716139725330438e-05, + "loss": 2.7878, + "step": 4108000 + }, + { + "epoch": 1.2771870487606607, + "grad_norm": 7.315313816070557, + "learning_rate": 2.8713549187322325e-05, + "loss": 2.8398, + "step": 4108500 + }, + { + "epoch": 1.2773424810411476, + "grad_norm": 7.775506973266602, + "learning_rate": 2.8710958649314205e-05, + "loss": 2.8071, + "step": 4109000 + }, + { + "epoch": 1.2774979133216344, + "grad_norm": 11.213077545166016, + "learning_rate": 2.8708368111306093e-05, + "loss": 2.8347, + "step": 4109500 + }, + { + "epoch": 1.2776533456021213, + "grad_norm": 9.124944686889648, + "learning_rate": 2.8705777573297983e-05, + "loss": 2.8492, + "step": 4110000 + }, + { + "epoch": 1.2778087778826082, + "grad_norm": 8.657026290893555, + "learning_rate": 2.8703187035289864e-05, + "loss": 2.8388, + "step": 4110500 + }, + { + "epoch": 1.277964210163095, + "grad_norm": 9.423057556152344, + "learning_rate": 2.870059649728175e-05, + "loss": 2.8276, + "step": 4111000 + }, + { + "epoch": 1.278119642443582, + "grad_norm": 11.570418357849121, + "learning_rate": 2.8698005959273634e-05, + "loss": 2.804, + "step": 4111500 + }, + { + "epoch": 1.278275074724069, + "grad_norm": 8.60379409790039, + "learning_rate": 2.869541542126552e-05, + "loss": 2.8313, + "step": 4112000 + }, + { + "epoch": 1.2784305070045558, + "grad_norm": 7.4307379722595215, + "learning_rate": 2.869282488325741e-05, + "loss": 2.8289, + "step": 4112500 + }, + { + "epoch": 1.2785859392850427, + "grad_norm": 11.690284729003906, + "learning_rate": 2.869023434524929e-05, + "loss": 2.8583, + "step": 4113000 + }, + { + "epoch": 1.2787413715655296, + "grad_norm": 8.680842399597168, + "learning_rate": 2.8687643807241176e-05, + "loss": 2.7991, + "step": 4113500 + }, + { + "epoch": 1.2788968038460165, + "grad_norm": 10.245673179626465, + "learning_rate": 2.8685053269233063e-05, + "loss": 2.8194, + "step": 4114000 + }, + { + "epoch": 1.2790522361265033, + "grad_norm": 17.431299209594727, + "learning_rate": 2.8682462731224947e-05, + "loss": 2.8516, + "step": 4114500 + }, + { + "epoch": 1.2792076684069902, + "grad_norm": 8.473666191101074, + "learning_rate": 2.8679872193216834e-05, + "loss": 2.8325, + "step": 4115000 + }, + { + "epoch": 1.279363100687477, + "grad_norm": 8.732259750366211, + "learning_rate": 2.867728165520872e-05, + "loss": 2.8653, + "step": 4115500 + }, + { + "epoch": 1.279518532967964, + "grad_norm": 9.321239471435547, + "learning_rate": 2.8674691117200602e-05, + "loss": 2.835, + "step": 4116000 + }, + { + "epoch": 1.2796739652484508, + "grad_norm": 12.073358535766602, + "learning_rate": 2.867210057919249e-05, + "loss": 2.8041, + "step": 4116500 + }, + { + "epoch": 1.2798293975289377, + "grad_norm": 7.99536657333374, + "learning_rate": 2.8669510041184373e-05, + "loss": 2.8006, + "step": 4117000 + }, + { + "epoch": 1.2799848298094245, + "grad_norm": 16.762727737426758, + "learning_rate": 2.866691950317626e-05, + "loss": 2.8162, + "step": 4117500 + }, + { + "epoch": 1.2801402620899114, + "grad_norm": 7.610921859741211, + "learning_rate": 2.8664328965168147e-05, + "loss": 2.8307, + "step": 4118000 + }, + { + "epoch": 1.2802956943703983, + "grad_norm": 9.102502822875977, + "learning_rate": 2.8661738427160028e-05, + "loss": 2.8351, + "step": 4118500 + }, + { + "epoch": 1.2804511266508851, + "grad_norm": 8.990608215332031, + "learning_rate": 2.8659147889151915e-05, + "loss": 2.7871, + "step": 4119000 + }, + { + "epoch": 1.280606558931372, + "grad_norm": 9.044364929199219, + "learning_rate": 2.8656557351143802e-05, + "loss": 2.8304, + "step": 4119500 + }, + { + "epoch": 1.2807619912118589, + "grad_norm": 11.441033363342285, + "learning_rate": 2.8653966813135686e-05, + "loss": 2.8695, + "step": 4120000 + }, + { + "epoch": 1.2809174234923457, + "grad_norm": 8.201330184936523, + "learning_rate": 2.8651376275127573e-05, + "loss": 2.8028, + "step": 4120500 + }, + { + "epoch": 1.2810728557728326, + "grad_norm": 9.502473831176758, + "learning_rate": 2.864878573711946e-05, + "loss": 2.8, + "step": 4121000 + }, + { + "epoch": 1.2812282880533195, + "grad_norm": 7.8317155838012695, + "learning_rate": 2.8646195199111344e-05, + "loss": 2.866, + "step": 4121500 + }, + { + "epoch": 1.2813837203338063, + "grad_norm": 6.915821552276611, + "learning_rate": 2.864360466110323e-05, + "loss": 2.8175, + "step": 4122000 + }, + { + "epoch": 1.2815391526142932, + "grad_norm": 8.666728973388672, + "learning_rate": 2.8641014123095118e-05, + "loss": 2.8201, + "step": 4122500 + }, + { + "epoch": 1.28169458489478, + "grad_norm": 10.34284496307373, + "learning_rate": 2.8638423585087e-05, + "loss": 2.82, + "step": 4123000 + }, + { + "epoch": 1.281850017175267, + "grad_norm": 8.01115608215332, + "learning_rate": 2.8635833047078885e-05, + "loss": 2.7902, + "step": 4123500 + }, + { + "epoch": 1.2820054494557538, + "grad_norm": 9.233762741088867, + "learning_rate": 2.863324250907077e-05, + "loss": 2.8246, + "step": 4124000 + }, + { + "epoch": 1.2821608817362407, + "grad_norm": 7.850268840789795, + "learning_rate": 2.8630651971062656e-05, + "loss": 2.8585, + "step": 4124500 + }, + { + "epoch": 1.2823163140167275, + "grad_norm": 8.22705364227295, + "learning_rate": 2.8628061433054544e-05, + "loss": 2.8385, + "step": 4125000 + }, + { + "epoch": 1.2824717462972144, + "grad_norm": 7.601090431213379, + "learning_rate": 2.8625470895046424e-05, + "loss": 2.8483, + "step": 4125500 + }, + { + "epoch": 1.2826271785777013, + "grad_norm": 9.977873802185059, + "learning_rate": 2.862288035703831e-05, + "loss": 2.7878, + "step": 4126000 + }, + { + "epoch": 1.2827826108581881, + "grad_norm": 9.255353927612305, + "learning_rate": 2.8620289819030198e-05, + "loss": 2.8261, + "step": 4126500 + }, + { + "epoch": 1.282938043138675, + "grad_norm": 11.769827842712402, + "learning_rate": 2.8617699281022082e-05, + "loss": 2.8315, + "step": 4127000 + }, + { + "epoch": 1.283093475419162, + "grad_norm": 9.57890510559082, + "learning_rate": 2.861510874301397e-05, + "loss": 2.8176, + "step": 4127500 + }, + { + "epoch": 1.283248907699649, + "grad_norm": 6.816061973571777, + "learning_rate": 2.8612518205005856e-05, + "loss": 2.7989, + "step": 4128000 + }, + { + "epoch": 1.2834043399801358, + "grad_norm": 16.295854568481445, + "learning_rate": 2.8609927666997737e-05, + "loss": 2.8403, + "step": 4128500 + }, + { + "epoch": 1.2835597722606227, + "grad_norm": 11.854782104492188, + "learning_rate": 2.8607337128989624e-05, + "loss": 2.7899, + "step": 4129000 + }, + { + "epoch": 1.2837152045411095, + "grad_norm": 8.925942420959473, + "learning_rate": 2.8604746590981508e-05, + "loss": 2.8352, + "step": 4129500 + }, + { + "epoch": 1.2838706368215964, + "grad_norm": 7.073259353637695, + "learning_rate": 2.8602156052973395e-05, + "loss": 2.8307, + "step": 4130000 + }, + { + "epoch": 1.2840260691020833, + "grad_norm": 10.527480125427246, + "learning_rate": 2.8599565514965282e-05, + "loss": 2.7948, + "step": 4130500 + }, + { + "epoch": 1.2841815013825701, + "grad_norm": 24.941783905029297, + "learning_rate": 2.8596974976957162e-05, + "loss": 2.7963, + "step": 4131000 + }, + { + "epoch": 1.284336933663057, + "grad_norm": 8.552519798278809, + "learning_rate": 2.8594384438949053e-05, + "loss": 2.8343, + "step": 4131500 + }, + { + "epoch": 1.2844923659435439, + "grad_norm": 9.508756637573242, + "learning_rate": 2.859179390094094e-05, + "loss": 2.8281, + "step": 4132000 + }, + { + "epoch": 1.2846477982240307, + "grad_norm": 8.726760864257812, + "learning_rate": 2.858920336293282e-05, + "loss": 2.7998, + "step": 4132500 + }, + { + "epoch": 1.2848032305045176, + "grad_norm": 9.211379051208496, + "learning_rate": 2.8586612824924708e-05, + "loss": 2.8811, + "step": 4133000 + }, + { + "epoch": 1.2849586627850045, + "grad_norm": 28.730926513671875, + "learning_rate": 2.8584022286916595e-05, + "loss": 2.8296, + "step": 4133500 + }, + { + "epoch": 1.2851140950654913, + "grad_norm": 8.614401817321777, + "learning_rate": 2.858143174890848e-05, + "loss": 2.8032, + "step": 4134000 + }, + { + "epoch": 1.2852695273459782, + "grad_norm": 9.788067817687988, + "learning_rate": 2.8578841210900366e-05, + "loss": 2.8178, + "step": 4134500 + }, + { + "epoch": 1.285424959626465, + "grad_norm": 10.061695098876953, + "learning_rate": 2.8576250672892246e-05, + "loss": 2.8065, + "step": 4135000 + }, + { + "epoch": 1.285580391906952, + "grad_norm": 8.054530143737793, + "learning_rate": 2.8573660134884133e-05, + "loss": 2.8053, + "step": 4135500 + }, + { + "epoch": 1.2857358241874388, + "grad_norm": 9.261175155639648, + "learning_rate": 2.857106959687602e-05, + "loss": 2.8138, + "step": 4136000 + }, + { + "epoch": 1.285891256467926, + "grad_norm": 18.358274459838867, + "learning_rate": 2.8568479058867904e-05, + "loss": 2.8256, + "step": 4136500 + }, + { + "epoch": 1.2860466887484128, + "grad_norm": 15.97603702545166, + "learning_rate": 2.856588852085979e-05, + "loss": 2.8329, + "step": 4137000 + }, + { + "epoch": 1.2862021210288996, + "grad_norm": 8.710835456848145, + "learning_rate": 2.856329798285168e-05, + "loss": 2.8458, + "step": 4137500 + }, + { + "epoch": 1.2863575533093865, + "grad_norm": 12.29102897644043, + "learning_rate": 2.856070744484356e-05, + "loss": 2.8882, + "step": 4138000 + }, + { + "epoch": 1.2865129855898734, + "grad_norm": 35.47603225708008, + "learning_rate": 2.8558116906835446e-05, + "loss": 2.7777, + "step": 4138500 + }, + { + "epoch": 1.2866684178703602, + "grad_norm": 6.87654447555542, + "learning_rate": 2.8555526368827333e-05, + "loss": 2.8035, + "step": 4139000 + }, + { + "epoch": 1.286823850150847, + "grad_norm": 12.660131454467773, + "learning_rate": 2.8552935830819217e-05, + "loss": 2.8262, + "step": 4139500 + }, + { + "epoch": 1.286979282431334, + "grad_norm": 9.511983871459961, + "learning_rate": 2.8550345292811104e-05, + "loss": 2.7857, + "step": 4140000 + }, + { + "epoch": 1.2871347147118208, + "grad_norm": 9.015130043029785, + "learning_rate": 2.854775475480299e-05, + "loss": 2.813, + "step": 4140500 + }, + { + "epoch": 1.2872901469923077, + "grad_norm": 10.113776206970215, + "learning_rate": 2.854516421679487e-05, + "loss": 2.7954, + "step": 4141000 + }, + { + "epoch": 1.2874455792727946, + "grad_norm": 16.58985137939453, + "learning_rate": 2.8542573678786762e-05, + "loss": 2.7991, + "step": 4141500 + }, + { + "epoch": 1.2876010115532814, + "grad_norm": 8.525391578674316, + "learning_rate": 2.8539983140778642e-05, + "loss": 2.8389, + "step": 4142000 + }, + { + "epoch": 1.2877564438337683, + "grad_norm": 8.672581672668457, + "learning_rate": 2.853739260277053e-05, + "loss": 2.8608, + "step": 4142500 + }, + { + "epoch": 1.2879118761142552, + "grad_norm": 9.808234214782715, + "learning_rate": 2.8534802064762417e-05, + "loss": 2.8799, + "step": 4143000 + }, + { + "epoch": 1.288067308394742, + "grad_norm": 8.038228034973145, + "learning_rate": 2.85322115267543e-05, + "loss": 2.8242, + "step": 4143500 + }, + { + "epoch": 1.288222740675229, + "grad_norm": 26.59203338623047, + "learning_rate": 2.8529620988746188e-05, + "loss": 2.7722, + "step": 4144000 + }, + { + "epoch": 1.2883781729557158, + "grad_norm": 8.261322021484375, + "learning_rate": 2.8527030450738075e-05, + "loss": 2.7708, + "step": 4144500 + }, + { + "epoch": 1.2885336052362026, + "grad_norm": 7.626896858215332, + "learning_rate": 2.8524439912729955e-05, + "loss": 2.807, + "step": 4145000 + }, + { + "epoch": 1.2886890375166895, + "grad_norm": 8.576926231384277, + "learning_rate": 2.8521849374721842e-05, + "loss": 2.752, + "step": 4145500 + }, + { + "epoch": 1.2888444697971764, + "grad_norm": 11.397835731506348, + "learning_rate": 2.851925883671373e-05, + "loss": 2.8037, + "step": 4146000 + }, + { + "epoch": 1.2889999020776632, + "grad_norm": 9.856196403503418, + "learning_rate": 2.8516668298705613e-05, + "loss": 2.8124, + "step": 4146500 + }, + { + "epoch": 1.28915533435815, + "grad_norm": 8.849781036376953, + "learning_rate": 2.85140777606975e-05, + "loss": 2.8315, + "step": 4147000 + }, + { + "epoch": 1.289310766638637, + "grad_norm": 8.903855323791504, + "learning_rate": 2.851148722268938e-05, + "loss": 2.7932, + "step": 4147500 + }, + { + "epoch": 1.2894661989191238, + "grad_norm": 8.460314750671387, + "learning_rate": 2.8508896684681268e-05, + "loss": 2.8263, + "step": 4148000 + }, + { + "epoch": 1.2896216311996107, + "grad_norm": 8.126692771911621, + "learning_rate": 2.8506306146673155e-05, + "loss": 2.875, + "step": 4148500 + }, + { + "epoch": 1.2897770634800976, + "grad_norm": 10.388587951660156, + "learning_rate": 2.850371560866504e-05, + "loss": 2.8012, + "step": 4149000 + }, + { + "epoch": 1.2899324957605844, + "grad_norm": 18.158849716186523, + "learning_rate": 2.8501125070656926e-05, + "loss": 2.8233, + "step": 4149500 + }, + { + "epoch": 1.2900879280410713, + "grad_norm": 11.360271453857422, + "learning_rate": 2.8498534532648813e-05, + "loss": 2.8587, + "step": 4150000 + }, + { + "epoch": 1.2902433603215582, + "grad_norm": 11.005188941955566, + "learning_rate": 2.8495943994640694e-05, + "loss": 2.8193, + "step": 4150500 + }, + { + "epoch": 1.290398792602045, + "grad_norm": 8.628840446472168, + "learning_rate": 2.849335345663258e-05, + "loss": 2.766, + "step": 4151000 + }, + { + "epoch": 1.2905542248825321, + "grad_norm": 8.715170860290527, + "learning_rate": 2.849076291862447e-05, + "loss": 2.8478, + "step": 4151500 + }, + { + "epoch": 1.290709657163019, + "grad_norm": 8.773530960083008, + "learning_rate": 2.848817238061635e-05, + "loss": 2.8112, + "step": 4152000 + }, + { + "epoch": 1.2908650894435059, + "grad_norm": 9.652569770812988, + "learning_rate": 2.848558184260824e-05, + "loss": 2.8248, + "step": 4152500 + }, + { + "epoch": 1.2910205217239927, + "grad_norm": 35.589962005615234, + "learning_rate": 2.848299130460012e-05, + "loss": 2.8112, + "step": 4153000 + }, + { + "epoch": 1.2911759540044796, + "grad_norm": 9.333300590515137, + "learning_rate": 2.848040076659201e-05, + "loss": 2.8244, + "step": 4153500 + }, + { + "epoch": 1.2913313862849665, + "grad_norm": 9.499716758728027, + "learning_rate": 2.8477810228583897e-05, + "loss": 2.849, + "step": 4154000 + }, + { + "epoch": 1.2914868185654533, + "grad_norm": 9.723875045776367, + "learning_rate": 2.8475219690575777e-05, + "loss": 2.7813, + "step": 4154500 + }, + { + "epoch": 1.2916422508459402, + "grad_norm": 7.958575248718262, + "learning_rate": 2.8472629152567664e-05, + "loss": 2.7871, + "step": 4155000 + }, + { + "epoch": 1.291797683126427, + "grad_norm": 8.225167274475098, + "learning_rate": 2.847003861455955e-05, + "loss": 2.8158, + "step": 4155500 + }, + { + "epoch": 1.291953115406914, + "grad_norm": 13.385741233825684, + "learning_rate": 2.8467448076551435e-05, + "loss": 2.8148, + "step": 4156000 + }, + { + "epoch": 1.2921085476874008, + "grad_norm": 10.803699493408203, + "learning_rate": 2.8464857538543322e-05, + "loss": 2.826, + "step": 4156500 + }, + { + "epoch": 1.2922639799678877, + "grad_norm": 10.424778938293457, + "learning_rate": 2.846226700053521e-05, + "loss": 2.8403, + "step": 4157000 + }, + { + "epoch": 1.2924194122483745, + "grad_norm": 11.749484062194824, + "learning_rate": 2.845967646252709e-05, + "loss": 2.8224, + "step": 4157500 + }, + { + "epoch": 1.2925748445288614, + "grad_norm": 9.52691650390625, + "learning_rate": 2.8457085924518977e-05, + "loss": 2.8166, + "step": 4158000 + }, + { + "epoch": 1.2927302768093483, + "grad_norm": 33.89201736450195, + "learning_rate": 2.8454495386510864e-05, + "loss": 2.8652, + "step": 4158500 + }, + { + "epoch": 1.2928857090898351, + "grad_norm": 7.629056930541992, + "learning_rate": 2.8451904848502748e-05, + "loss": 2.8399, + "step": 4159000 + }, + { + "epoch": 1.293041141370322, + "grad_norm": 11.208063125610352, + "learning_rate": 2.8449314310494635e-05, + "loss": 2.7717, + "step": 4159500 + }, + { + "epoch": 1.2931965736508089, + "grad_norm": 10.586935997009277, + "learning_rate": 2.8446723772486516e-05, + "loss": 2.8098, + "step": 4160000 + }, + { + "epoch": 1.293352005931296, + "grad_norm": 8.980918884277344, + "learning_rate": 2.8444133234478403e-05, + "loss": 2.8197, + "step": 4160500 + }, + { + "epoch": 1.2935074382117828, + "grad_norm": 7.141735076904297, + "learning_rate": 2.844154269647029e-05, + "loss": 2.8156, + "step": 4161000 + }, + { + "epoch": 1.2936628704922697, + "grad_norm": 8.476053237915039, + "learning_rate": 2.8438952158462174e-05, + "loss": 2.8334, + "step": 4161500 + }, + { + "epoch": 1.2938183027727566, + "grad_norm": 8.559693336486816, + "learning_rate": 2.843636162045406e-05, + "loss": 2.8059, + "step": 4162000 + }, + { + "epoch": 1.2939737350532434, + "grad_norm": 8.780301094055176, + "learning_rate": 2.8433771082445948e-05, + "loss": 2.8741, + "step": 4162500 + }, + { + "epoch": 1.2941291673337303, + "grad_norm": 8.167019844055176, + "learning_rate": 2.8431180544437828e-05, + "loss": 2.8447, + "step": 4163000 + }, + { + "epoch": 1.2942845996142172, + "grad_norm": 8.170044898986816, + "learning_rate": 2.842859000642972e-05, + "loss": 2.7875, + "step": 4163500 + }, + { + "epoch": 1.294440031894704, + "grad_norm": 9.629762649536133, + "learning_rate": 2.8425999468421606e-05, + "loss": 2.8154, + "step": 4164000 + }, + { + "epoch": 1.294595464175191, + "grad_norm": 27.681650161743164, + "learning_rate": 2.8423408930413486e-05, + "loss": 2.8447, + "step": 4164500 + }, + { + "epoch": 1.2947508964556778, + "grad_norm": 11.877565383911133, + "learning_rate": 2.8420818392405374e-05, + "loss": 2.8453, + "step": 4165000 + }, + { + "epoch": 1.2949063287361646, + "grad_norm": 9.078178405761719, + "learning_rate": 2.8418227854397257e-05, + "loss": 2.7725, + "step": 4165500 + }, + { + "epoch": 1.2950617610166515, + "grad_norm": 8.271347045898438, + "learning_rate": 2.8415637316389144e-05, + "loss": 2.807, + "step": 4166000 + }, + { + "epoch": 1.2952171932971384, + "grad_norm": 8.094144821166992, + "learning_rate": 2.841304677838103e-05, + "loss": 2.8288, + "step": 4166500 + }, + { + "epoch": 1.2953726255776252, + "grad_norm": 11.666854858398438, + "learning_rate": 2.8410456240372912e-05, + "loss": 2.8125, + "step": 4167000 + }, + { + "epoch": 1.295528057858112, + "grad_norm": 8.10000228881836, + "learning_rate": 2.84078657023648e-05, + "loss": 2.818, + "step": 4167500 + }, + { + "epoch": 1.295683490138599, + "grad_norm": 8.978521347045898, + "learning_rate": 2.8405275164356686e-05, + "loss": 2.8269, + "step": 4168000 + }, + { + "epoch": 1.2958389224190858, + "grad_norm": 8.311487197875977, + "learning_rate": 2.840268462634857e-05, + "loss": 2.83, + "step": 4168500 + }, + { + "epoch": 1.2959943546995727, + "grad_norm": 12.103716850280762, + "learning_rate": 2.8400094088340457e-05, + "loss": 2.861, + "step": 4169000 + }, + { + "epoch": 1.2961497869800596, + "grad_norm": 8.455634117126465, + "learning_rate": 2.8397503550332344e-05, + "loss": 2.8514, + "step": 4169500 + }, + { + "epoch": 1.2963052192605464, + "grad_norm": 5.4632158279418945, + "learning_rate": 2.8394913012324225e-05, + "loss": 2.8262, + "step": 4170000 + }, + { + "epoch": 1.2964606515410333, + "grad_norm": 9.294608116149902, + "learning_rate": 2.8392322474316112e-05, + "loss": 2.8194, + "step": 4170500 + }, + { + "epoch": 1.2966160838215202, + "grad_norm": 8.985095977783203, + "learning_rate": 2.8389731936307996e-05, + "loss": 2.8427, + "step": 4171000 + }, + { + "epoch": 1.296771516102007, + "grad_norm": 9.401241302490234, + "learning_rate": 2.8387141398299883e-05, + "loss": 2.7927, + "step": 4171500 + }, + { + "epoch": 1.296926948382494, + "grad_norm": 10.726057052612305, + "learning_rate": 2.838455086029177e-05, + "loss": 2.8111, + "step": 4172000 + }, + { + "epoch": 1.2970823806629808, + "grad_norm": 9.86369514465332, + "learning_rate": 2.838196032228365e-05, + "loss": 2.7889, + "step": 4172500 + }, + { + "epoch": 1.2972378129434676, + "grad_norm": 14.539722442626953, + "learning_rate": 2.8379369784275537e-05, + "loss": 2.8384, + "step": 4173000 + }, + { + "epoch": 1.2973932452239545, + "grad_norm": 9.559749603271484, + "learning_rate": 2.8376779246267428e-05, + "loss": 2.8138, + "step": 4173500 + }, + { + "epoch": 1.2975486775044414, + "grad_norm": 8.104920387268066, + "learning_rate": 2.837418870825931e-05, + "loss": 2.8102, + "step": 4174000 + }, + { + "epoch": 1.2977041097849282, + "grad_norm": 20.99620246887207, + "learning_rate": 2.8371598170251196e-05, + "loss": 2.8431, + "step": 4174500 + }, + { + "epoch": 1.297859542065415, + "grad_norm": 8.974562644958496, + "learning_rate": 2.8369007632243083e-05, + "loss": 2.8987, + "step": 4175000 + }, + { + "epoch": 1.2980149743459022, + "grad_norm": 9.847721099853516, + "learning_rate": 2.8366417094234966e-05, + "loss": 2.8652, + "step": 4175500 + }, + { + "epoch": 1.298170406626389, + "grad_norm": 10.76940631866455, + "learning_rate": 2.8363826556226854e-05, + "loss": 2.8311, + "step": 4176000 + }, + { + "epoch": 1.298325838906876, + "grad_norm": 7.355706214904785, + "learning_rate": 2.836123601821874e-05, + "loss": 2.7751, + "step": 4176500 + }, + { + "epoch": 1.2984812711873628, + "grad_norm": 9.077010154724121, + "learning_rate": 2.835864548021062e-05, + "loss": 2.8076, + "step": 4177000 + }, + { + "epoch": 1.2986367034678497, + "grad_norm": 15.481781959533691, + "learning_rate": 2.835605494220251e-05, + "loss": 2.8282, + "step": 4177500 + }, + { + "epoch": 1.2987921357483365, + "grad_norm": 9.040555000305176, + "learning_rate": 2.8353464404194392e-05, + "loss": 2.7848, + "step": 4178000 + }, + { + "epoch": 1.2989475680288234, + "grad_norm": 13.4321928024292, + "learning_rate": 2.835087386618628e-05, + "loss": 2.8536, + "step": 4178500 + }, + { + "epoch": 1.2991030003093103, + "grad_norm": 11.09743881225586, + "learning_rate": 2.8348283328178166e-05, + "loss": 2.7749, + "step": 4179000 + }, + { + "epoch": 1.2992584325897971, + "grad_norm": 13.140623092651367, + "learning_rate": 2.8345692790170047e-05, + "loss": 2.8448, + "step": 4179500 + }, + { + "epoch": 1.299413864870284, + "grad_norm": 9.816787719726562, + "learning_rate": 2.8343102252161934e-05, + "loss": 2.8034, + "step": 4180000 + }, + { + "epoch": 1.2995692971507709, + "grad_norm": 6.672100067138672, + "learning_rate": 2.834051171415382e-05, + "loss": 2.8483, + "step": 4180500 + }, + { + "epoch": 1.2997247294312577, + "grad_norm": 9.991350173950195, + "learning_rate": 2.8337921176145705e-05, + "loss": 2.8077, + "step": 4181000 + }, + { + "epoch": 1.2998801617117446, + "grad_norm": 11.71463394165039, + "learning_rate": 2.8335330638137592e-05, + "loss": 2.7842, + "step": 4181500 + }, + { + "epoch": 1.3000355939922315, + "grad_norm": 23.307292938232422, + "learning_rate": 2.833274010012948e-05, + "loss": 2.7747, + "step": 4182000 + }, + { + "epoch": 1.3001910262727183, + "grad_norm": 9.175529479980469, + "learning_rate": 2.833014956212136e-05, + "loss": 2.7543, + "step": 4182500 + }, + { + "epoch": 1.3003464585532052, + "grad_norm": 16.172229766845703, + "learning_rate": 2.8327559024113247e-05, + "loss": 2.8024, + "step": 4183000 + }, + { + "epoch": 1.300501890833692, + "grad_norm": 9.489645957946777, + "learning_rate": 2.832496848610513e-05, + "loss": 2.8249, + "step": 4183500 + }, + { + "epoch": 1.300657323114179, + "grad_norm": 7.4192633628845215, + "learning_rate": 2.8322377948097018e-05, + "loss": 2.8393, + "step": 4184000 + }, + { + "epoch": 1.300812755394666, + "grad_norm": 10.19690227508545, + "learning_rate": 2.8319787410088905e-05, + "loss": 2.8316, + "step": 4184500 + }, + { + "epoch": 1.3009681876751529, + "grad_norm": 10.073777198791504, + "learning_rate": 2.8317196872080785e-05, + "loss": 2.814, + "step": 4185000 + }, + { + "epoch": 1.3011236199556397, + "grad_norm": 8.310441970825195, + "learning_rate": 2.8314606334072676e-05, + "loss": 2.7963, + "step": 4185500 + }, + { + "epoch": 1.3012790522361266, + "grad_norm": 8.508282661437988, + "learning_rate": 2.8312015796064563e-05, + "loss": 2.803, + "step": 4186000 + }, + { + "epoch": 1.3014344845166135, + "grad_norm": 10.362727165222168, + "learning_rate": 2.8309425258056443e-05, + "loss": 2.8051, + "step": 4186500 + }, + { + "epoch": 1.3015899167971003, + "grad_norm": 12.113873481750488, + "learning_rate": 2.830683472004833e-05, + "loss": 2.8024, + "step": 4187000 + }, + { + "epoch": 1.3017453490775872, + "grad_norm": 40.848968505859375, + "learning_rate": 2.8304244182040218e-05, + "loss": 2.7801, + "step": 4187500 + }, + { + "epoch": 1.301900781358074, + "grad_norm": 12.225676536560059, + "learning_rate": 2.83016536440321e-05, + "loss": 2.8207, + "step": 4188000 + }, + { + "epoch": 1.302056213638561, + "grad_norm": 11.769515991210938, + "learning_rate": 2.829906310602399e-05, + "loss": 2.7976, + "step": 4188500 + }, + { + "epoch": 1.3022116459190478, + "grad_norm": 10.09959602355957, + "learning_rate": 2.829647256801587e-05, + "loss": 2.8037, + "step": 4189000 + }, + { + "epoch": 1.3023670781995347, + "grad_norm": 6.909239768981934, + "learning_rate": 2.8293882030007756e-05, + "loss": 2.8, + "step": 4189500 + }, + { + "epoch": 1.3025225104800215, + "grad_norm": 8.540985107421875, + "learning_rate": 2.8291291491999643e-05, + "loss": 2.7557, + "step": 4190000 + }, + { + "epoch": 1.3026779427605084, + "grad_norm": 8.374438285827637, + "learning_rate": 2.8288700953991527e-05, + "loss": 2.8358, + "step": 4190500 + }, + { + "epoch": 1.3028333750409953, + "grad_norm": 9.30809497833252, + "learning_rate": 2.8286110415983414e-05, + "loss": 2.8208, + "step": 4191000 + }, + { + "epoch": 1.3029888073214821, + "grad_norm": 7.975707054138184, + "learning_rate": 2.82835198779753e-05, + "loss": 2.8089, + "step": 4191500 + }, + { + "epoch": 1.303144239601969, + "grad_norm": 12.645511627197266, + "learning_rate": 2.828092933996718e-05, + "loss": 2.822, + "step": 4192000 + }, + { + "epoch": 1.3032996718824559, + "grad_norm": 10.58139419555664, + "learning_rate": 2.827833880195907e-05, + "loss": 2.8104, + "step": 4192500 + }, + { + "epoch": 1.3034551041629427, + "grad_norm": 6.8170552253723145, + "learning_rate": 2.8275748263950956e-05, + "loss": 2.8072, + "step": 4193000 + }, + { + "epoch": 1.3036105364434296, + "grad_norm": 28.72083854675293, + "learning_rate": 2.827315772594284e-05, + "loss": 2.8727, + "step": 4193500 + }, + { + "epoch": 1.3037659687239165, + "grad_norm": 9.044017791748047, + "learning_rate": 2.8270567187934727e-05, + "loss": 2.8355, + "step": 4194000 + }, + { + "epoch": 1.3039214010044033, + "grad_norm": 11.416905403137207, + "learning_rate": 2.8267976649926614e-05, + "loss": 2.8056, + "step": 4194500 + }, + { + "epoch": 1.3040768332848902, + "grad_norm": 9.962327003479004, + "learning_rate": 2.8265386111918494e-05, + "loss": 2.7762, + "step": 4195000 + }, + { + "epoch": 1.304232265565377, + "grad_norm": 8.666047096252441, + "learning_rate": 2.8262795573910385e-05, + "loss": 2.8317, + "step": 4195500 + }, + { + "epoch": 1.304387697845864, + "grad_norm": 10.675084114074707, + "learning_rate": 2.8260205035902265e-05, + "loss": 2.7673, + "step": 4196000 + }, + { + "epoch": 1.3045431301263508, + "grad_norm": 12.932022094726562, + "learning_rate": 2.8257614497894152e-05, + "loss": 2.8216, + "step": 4196500 + }, + { + "epoch": 1.3046985624068377, + "grad_norm": 11.159161567687988, + "learning_rate": 2.825502395988604e-05, + "loss": 2.8151, + "step": 4197000 + }, + { + "epoch": 1.3048539946873245, + "grad_norm": 9.70851993560791, + "learning_rate": 2.8252433421877923e-05, + "loss": 2.816, + "step": 4197500 + }, + { + "epoch": 1.3050094269678114, + "grad_norm": 14.931695938110352, + "learning_rate": 2.824984288386981e-05, + "loss": 2.8087, + "step": 4198000 + }, + { + "epoch": 1.3051648592482983, + "grad_norm": 8.703289985656738, + "learning_rate": 2.8247252345861698e-05, + "loss": 2.8381, + "step": 4198500 + }, + { + "epoch": 1.3053202915287851, + "grad_norm": 11.911980628967285, + "learning_rate": 2.8244661807853578e-05, + "loss": 2.8233, + "step": 4199000 + }, + { + "epoch": 1.3054757238092722, + "grad_norm": 16.823528289794922, + "learning_rate": 2.8242071269845465e-05, + "loss": 2.7967, + "step": 4199500 + }, + { + "epoch": 1.305631156089759, + "grad_norm": 9.788041114807129, + "learning_rate": 2.8239480731837352e-05, + "loss": 2.8055, + "step": 4200000 + }, + { + "epoch": 1.305786588370246, + "grad_norm": 10.105781555175781, + "learning_rate": 2.8236890193829236e-05, + "loss": 2.8253, + "step": 4200500 + }, + { + "epoch": 1.3059420206507328, + "grad_norm": 10.40253734588623, + "learning_rate": 2.8234299655821123e-05, + "loss": 2.8187, + "step": 4201000 + }, + { + "epoch": 1.3060974529312197, + "grad_norm": 14.371759414672852, + "learning_rate": 2.8231709117813004e-05, + "loss": 2.8118, + "step": 4201500 + }, + { + "epoch": 1.3062528852117066, + "grad_norm": 7.703509330749512, + "learning_rate": 2.822911857980489e-05, + "loss": 2.9071, + "step": 4202000 + }, + { + "epoch": 1.3064083174921934, + "grad_norm": 9.45212173461914, + "learning_rate": 2.8226528041796778e-05, + "loss": 2.8004, + "step": 4202500 + }, + { + "epoch": 1.3065637497726803, + "grad_norm": 7.333556175231934, + "learning_rate": 2.822393750378866e-05, + "loss": 2.8292, + "step": 4203000 + }, + { + "epoch": 1.3067191820531672, + "grad_norm": 9.62678050994873, + "learning_rate": 2.822134696578055e-05, + "loss": 2.8099, + "step": 4203500 + }, + { + "epoch": 1.306874614333654, + "grad_norm": 9.028975486755371, + "learning_rate": 2.8218756427772436e-05, + "loss": 2.8043, + "step": 4204000 + }, + { + "epoch": 1.307030046614141, + "grad_norm": 11.768975257873535, + "learning_rate": 2.8216165889764316e-05, + "loss": 2.8155, + "step": 4204500 + }, + { + "epoch": 1.3071854788946278, + "grad_norm": 9.615663528442383, + "learning_rate": 2.8213575351756203e-05, + "loss": 2.8143, + "step": 4205000 + }, + { + "epoch": 1.3073409111751146, + "grad_norm": 9.643288612365723, + "learning_rate": 2.8210984813748094e-05, + "loss": 2.8319, + "step": 4205500 + }, + { + "epoch": 1.3074963434556015, + "grad_norm": 7.874513149261475, + "learning_rate": 2.8208394275739974e-05, + "loss": 2.8242, + "step": 4206000 + }, + { + "epoch": 1.3076517757360884, + "grad_norm": 7.859763145446777, + "learning_rate": 2.820580373773186e-05, + "loss": 2.8149, + "step": 4206500 + }, + { + "epoch": 1.3078072080165752, + "grad_norm": 10.970325469970703, + "learning_rate": 2.8203213199723745e-05, + "loss": 2.8159, + "step": 4207000 + }, + { + "epoch": 1.307962640297062, + "grad_norm": 6.747430324554443, + "learning_rate": 2.8200622661715632e-05, + "loss": 2.8103, + "step": 4207500 + }, + { + "epoch": 1.308118072577549, + "grad_norm": 10.032941818237305, + "learning_rate": 2.819803212370752e-05, + "loss": 2.7663, + "step": 4208000 + }, + { + "epoch": 1.308273504858036, + "grad_norm": 8.705611228942871, + "learning_rate": 2.81954415856994e-05, + "loss": 2.8498, + "step": 4208500 + }, + { + "epoch": 1.308428937138523, + "grad_norm": 9.070808410644531, + "learning_rate": 2.8192851047691287e-05, + "loss": 2.8094, + "step": 4209000 + }, + { + "epoch": 1.3085843694190098, + "grad_norm": 8.486721992492676, + "learning_rate": 2.8190260509683174e-05, + "loss": 2.7988, + "step": 4209500 + }, + { + "epoch": 1.3087398016994967, + "grad_norm": 21.32467269897461, + "learning_rate": 2.8187669971675058e-05, + "loss": 2.8033, + "step": 4210000 + }, + { + "epoch": 1.3088952339799835, + "grad_norm": 10.230703353881836, + "learning_rate": 2.8185079433666945e-05, + "loss": 2.8016, + "step": 4210500 + }, + { + "epoch": 1.3090506662604704, + "grad_norm": 37.845611572265625, + "learning_rate": 2.8182488895658832e-05, + "loss": 2.7895, + "step": 4211000 + }, + { + "epoch": 1.3092060985409573, + "grad_norm": 8.464642524719238, + "learning_rate": 2.8179898357650713e-05, + "loss": 2.7956, + "step": 4211500 + }, + { + "epoch": 1.3093615308214441, + "grad_norm": 8.168045997619629, + "learning_rate": 2.81773078196426e-05, + "loss": 2.7752, + "step": 4212000 + }, + { + "epoch": 1.309516963101931, + "grad_norm": 8.469412803649902, + "learning_rate": 2.8174717281634487e-05, + "loss": 2.8131, + "step": 4212500 + }, + { + "epoch": 1.3096723953824179, + "grad_norm": 9.491640090942383, + "learning_rate": 2.817212674362637e-05, + "loss": 2.8064, + "step": 4213000 + }, + { + "epoch": 1.3098278276629047, + "grad_norm": 8.551959037780762, + "learning_rate": 2.8169536205618258e-05, + "loss": 2.831, + "step": 4213500 + }, + { + "epoch": 1.3099832599433916, + "grad_norm": 9.992714881896973, + "learning_rate": 2.816694566761014e-05, + "loss": 2.7833, + "step": 4214000 + }, + { + "epoch": 1.3101386922238785, + "grad_norm": 5.952448844909668, + "learning_rate": 2.8164355129602026e-05, + "loss": 2.8166, + "step": 4214500 + }, + { + "epoch": 1.3102941245043653, + "grad_norm": 15.491366386413574, + "learning_rate": 2.8161764591593913e-05, + "loss": 2.7856, + "step": 4215000 + }, + { + "epoch": 1.3104495567848522, + "grad_norm": 9.56592845916748, + "learning_rate": 2.8159174053585796e-05, + "loss": 2.839, + "step": 4215500 + }, + { + "epoch": 1.310604989065339, + "grad_norm": 10.605376243591309, + "learning_rate": 2.8156583515577684e-05, + "loss": 2.7896, + "step": 4216000 + }, + { + "epoch": 1.310760421345826, + "grad_norm": 11.095876693725586, + "learning_rate": 2.815399297756957e-05, + "loss": 2.8306, + "step": 4216500 + }, + { + "epoch": 1.3109158536263128, + "grad_norm": 9.486709594726562, + "learning_rate": 2.8151402439561455e-05, + "loss": 2.772, + "step": 4217000 + }, + { + "epoch": 1.3110712859067997, + "grad_norm": 7.920121669769287, + "learning_rate": 2.814881190155334e-05, + "loss": 2.8648, + "step": 4217500 + }, + { + "epoch": 1.3112267181872865, + "grad_norm": 8.572959899902344, + "learning_rate": 2.814622136354523e-05, + "loss": 2.7837, + "step": 4218000 + }, + { + "epoch": 1.3113821504677734, + "grad_norm": 10.232078552246094, + "learning_rate": 2.814363082553711e-05, + "loss": 2.8061, + "step": 4218500 + }, + { + "epoch": 1.3115375827482603, + "grad_norm": 8.469193458557129, + "learning_rate": 2.8141040287528996e-05, + "loss": 2.7746, + "step": 4219000 + }, + { + "epoch": 1.3116930150287471, + "grad_norm": 9.271539688110352, + "learning_rate": 2.813844974952088e-05, + "loss": 2.8647, + "step": 4219500 + }, + { + "epoch": 1.311848447309234, + "grad_norm": 10.401216506958008, + "learning_rate": 2.8135859211512767e-05, + "loss": 2.8095, + "step": 4220000 + }, + { + "epoch": 1.3120038795897209, + "grad_norm": 10.68674373626709, + "learning_rate": 2.8133268673504654e-05, + "loss": 2.7792, + "step": 4220500 + }, + { + "epoch": 1.3121593118702077, + "grad_norm": 8.560074806213379, + "learning_rate": 2.8130678135496535e-05, + "loss": 2.8272, + "step": 4221000 + }, + { + "epoch": 1.3123147441506946, + "grad_norm": 6.2628302574157715, + "learning_rate": 2.8128087597488422e-05, + "loss": 2.793, + "step": 4221500 + }, + { + "epoch": 1.3124701764311815, + "grad_norm": 9.369773864746094, + "learning_rate": 2.812549705948031e-05, + "loss": 2.8718, + "step": 4222000 + }, + { + "epoch": 1.3126256087116683, + "grad_norm": 11.887232780456543, + "learning_rate": 2.8122906521472193e-05, + "loss": 2.7775, + "step": 4222500 + }, + { + "epoch": 1.3127810409921552, + "grad_norm": 9.870308876037598, + "learning_rate": 2.812031598346408e-05, + "loss": 2.8336, + "step": 4223000 + }, + { + "epoch": 1.3129364732726423, + "grad_norm": 8.776838302612305, + "learning_rate": 2.8117725445455967e-05, + "loss": 2.8064, + "step": 4223500 + }, + { + "epoch": 1.3130919055531292, + "grad_norm": 9.292623519897461, + "learning_rate": 2.8115134907447848e-05, + "loss": 2.8425, + "step": 4224000 + }, + { + "epoch": 1.313247337833616, + "grad_norm": 11.722243309020996, + "learning_rate": 2.8112544369439735e-05, + "loss": 2.8469, + "step": 4224500 + }, + { + "epoch": 1.313402770114103, + "grad_norm": 9.00864315032959, + "learning_rate": 2.810995383143162e-05, + "loss": 2.8031, + "step": 4225000 + }, + { + "epoch": 1.3135582023945898, + "grad_norm": 14.568188667297363, + "learning_rate": 2.8107363293423506e-05, + "loss": 2.8138, + "step": 4225500 + }, + { + "epoch": 1.3137136346750766, + "grad_norm": 10.818573951721191, + "learning_rate": 2.8104772755415393e-05, + "loss": 2.8311, + "step": 4226000 + }, + { + "epoch": 1.3138690669555635, + "grad_norm": 10.416548728942871, + "learning_rate": 2.8102182217407273e-05, + "loss": 2.8454, + "step": 4226500 + }, + { + "epoch": 1.3140244992360504, + "grad_norm": 7.885244846343994, + "learning_rate": 2.8099591679399164e-05, + "loss": 2.8311, + "step": 4227000 + }, + { + "epoch": 1.3141799315165372, + "grad_norm": 11.160770416259766, + "learning_rate": 2.809700114139105e-05, + "loss": 2.8463, + "step": 4227500 + }, + { + "epoch": 1.314335363797024, + "grad_norm": 13.701737403869629, + "learning_rate": 2.809441060338293e-05, + "loss": 2.8179, + "step": 4228000 + }, + { + "epoch": 1.314490796077511, + "grad_norm": 10.068864822387695, + "learning_rate": 2.809182006537482e-05, + "loss": 2.8298, + "step": 4228500 + }, + { + "epoch": 1.3146462283579978, + "grad_norm": 8.361946105957031, + "learning_rate": 2.8089229527366706e-05, + "loss": 2.7635, + "step": 4229000 + }, + { + "epoch": 1.3148016606384847, + "grad_norm": 7.623314380645752, + "learning_rate": 2.808663898935859e-05, + "loss": 2.8495, + "step": 4229500 + }, + { + "epoch": 1.3149570929189716, + "grad_norm": 7.309601783752441, + "learning_rate": 2.8084048451350476e-05, + "loss": 2.7952, + "step": 4230000 + }, + { + "epoch": 1.3151125251994584, + "grad_norm": 10.877706527709961, + "learning_rate": 2.8081457913342364e-05, + "loss": 2.857, + "step": 4230500 + }, + { + "epoch": 1.3152679574799453, + "grad_norm": 11.630607604980469, + "learning_rate": 2.8078867375334244e-05, + "loss": 2.7958, + "step": 4231000 + }, + { + "epoch": 1.3154233897604322, + "grad_norm": 10.301816940307617, + "learning_rate": 2.807627683732613e-05, + "loss": 2.8602, + "step": 4231500 + }, + { + "epoch": 1.315578822040919, + "grad_norm": 8.766130447387695, + "learning_rate": 2.8073686299318015e-05, + "loss": 2.8506, + "step": 4232000 + }, + { + "epoch": 1.3157342543214061, + "grad_norm": 10.455126762390137, + "learning_rate": 2.8071095761309902e-05, + "loss": 2.8247, + "step": 4232500 + }, + { + "epoch": 1.315889686601893, + "grad_norm": 8.74099349975586, + "learning_rate": 2.806850522330179e-05, + "loss": 2.8407, + "step": 4233000 + }, + { + "epoch": 1.3160451188823798, + "grad_norm": 8.85863208770752, + "learning_rate": 2.806591468529367e-05, + "loss": 2.7929, + "step": 4233500 + }, + { + "epoch": 1.3162005511628667, + "grad_norm": 10.58686351776123, + "learning_rate": 2.8063324147285557e-05, + "loss": 2.8371, + "step": 4234000 + }, + { + "epoch": 1.3163559834433536, + "grad_norm": 8.208544731140137, + "learning_rate": 2.8060733609277444e-05, + "loss": 2.7825, + "step": 4234500 + }, + { + "epoch": 1.3165114157238405, + "grad_norm": 7.659417629241943, + "learning_rate": 2.8058143071269328e-05, + "loss": 2.7714, + "step": 4235000 + }, + { + "epoch": 1.3166668480043273, + "grad_norm": 8.745527267456055, + "learning_rate": 2.8055552533261215e-05, + "loss": 2.8, + "step": 4235500 + }, + { + "epoch": 1.3168222802848142, + "grad_norm": 13.574899673461914, + "learning_rate": 2.8052961995253102e-05, + "loss": 2.8487, + "step": 4236000 + }, + { + "epoch": 1.316977712565301, + "grad_norm": 9.734295845031738, + "learning_rate": 2.8050371457244982e-05, + "loss": 2.7888, + "step": 4236500 + }, + { + "epoch": 1.317133144845788, + "grad_norm": 12.157238006591797, + "learning_rate": 2.8047780919236873e-05, + "loss": 2.8379, + "step": 4237000 + }, + { + "epoch": 1.3172885771262748, + "grad_norm": 9.599797248840332, + "learning_rate": 2.8045190381228753e-05, + "loss": 2.7544, + "step": 4237500 + }, + { + "epoch": 1.3174440094067617, + "grad_norm": 6.439640998840332, + "learning_rate": 2.804259984322064e-05, + "loss": 2.8086, + "step": 4238000 + }, + { + "epoch": 1.3175994416872485, + "grad_norm": 12.142561912536621, + "learning_rate": 2.8040009305212528e-05, + "loss": 2.7957, + "step": 4238500 + }, + { + "epoch": 1.3177548739677354, + "grad_norm": 8.807642936706543, + "learning_rate": 2.803741876720441e-05, + "loss": 2.8147, + "step": 4239000 + }, + { + "epoch": 1.3179103062482223, + "grad_norm": 9.882445335388184, + "learning_rate": 2.80348282291963e-05, + "loss": 2.8281, + "step": 4239500 + }, + { + "epoch": 1.3180657385287091, + "grad_norm": 15.601997375488281, + "learning_rate": 2.8032237691188186e-05, + "loss": 2.8167, + "step": 4240000 + }, + { + "epoch": 1.318221170809196, + "grad_norm": 10.524483680725098, + "learning_rate": 2.8029647153180066e-05, + "loss": 2.8395, + "step": 4240500 + }, + { + "epoch": 1.3183766030896829, + "grad_norm": 8.306862831115723, + "learning_rate": 2.8027056615171953e-05, + "loss": 2.8104, + "step": 4241000 + }, + { + "epoch": 1.3185320353701697, + "grad_norm": 32.4226188659668, + "learning_rate": 2.802446607716384e-05, + "loss": 2.8183, + "step": 4241500 + }, + { + "epoch": 1.3186874676506566, + "grad_norm": 11.01070499420166, + "learning_rate": 2.8021875539155724e-05, + "loss": 2.8003, + "step": 4242000 + }, + { + "epoch": 1.3188428999311435, + "grad_norm": 10.682971954345703, + "learning_rate": 2.801928500114761e-05, + "loss": 2.8069, + "step": 4242500 + }, + { + "epoch": 1.3189983322116303, + "grad_norm": 9.679513931274414, + "learning_rate": 2.801669446313949e-05, + "loss": 2.8115, + "step": 4243000 + }, + { + "epoch": 1.3191537644921172, + "grad_norm": 11.616325378417969, + "learning_rate": 2.801410392513138e-05, + "loss": 2.8152, + "step": 4243500 + }, + { + "epoch": 1.319309196772604, + "grad_norm": 12.158623695373535, + "learning_rate": 2.8011513387123266e-05, + "loss": 2.8147, + "step": 4244000 + }, + { + "epoch": 1.319464629053091, + "grad_norm": 9.576438903808594, + "learning_rate": 2.800892284911515e-05, + "loss": 2.8556, + "step": 4244500 + }, + { + "epoch": 1.3196200613335778, + "grad_norm": 8.089942932128906, + "learning_rate": 2.8006332311107037e-05, + "loss": 2.7459, + "step": 4245000 + }, + { + "epoch": 1.3197754936140647, + "grad_norm": 9.340452194213867, + "learning_rate": 2.8003741773098924e-05, + "loss": 2.816, + "step": 4245500 + }, + { + "epoch": 1.3199309258945515, + "grad_norm": 8.794693946838379, + "learning_rate": 2.8001151235090804e-05, + "loss": 2.7951, + "step": 4246000 + }, + { + "epoch": 1.3200863581750384, + "grad_norm": 20.995208740234375, + "learning_rate": 2.799856069708269e-05, + "loss": 2.8233, + "step": 4246500 + }, + { + "epoch": 1.3202417904555253, + "grad_norm": 13.495207786560059, + "learning_rate": 2.7995970159074582e-05, + "loss": 2.8234, + "step": 4247000 + }, + { + "epoch": 1.3203972227360123, + "grad_norm": 10.020310401916504, + "learning_rate": 2.7993379621066462e-05, + "loss": 2.8674, + "step": 4247500 + }, + { + "epoch": 1.3205526550164992, + "grad_norm": 6.767462730407715, + "learning_rate": 2.799078908305835e-05, + "loss": 2.8173, + "step": 4248000 + }, + { + "epoch": 1.320708087296986, + "grad_norm": 63.21624755859375, + "learning_rate": 2.7988198545050237e-05, + "loss": 2.8209, + "step": 4248500 + }, + { + "epoch": 1.320863519577473, + "grad_norm": 9.040238380432129, + "learning_rate": 2.798560800704212e-05, + "loss": 2.7688, + "step": 4249000 + }, + { + "epoch": 1.3210189518579598, + "grad_norm": 12.11825180053711, + "learning_rate": 2.7983017469034008e-05, + "loss": 2.7975, + "step": 4249500 + }, + { + "epoch": 1.3211743841384467, + "grad_norm": 10.11253547668457, + "learning_rate": 2.7980426931025888e-05, + "loss": 2.7732, + "step": 4250000 + }, + { + "epoch": 1.3213298164189335, + "grad_norm": 9.14474105834961, + "learning_rate": 2.7977836393017775e-05, + "loss": 2.7929, + "step": 4250500 + }, + { + "epoch": 1.3214852486994204, + "grad_norm": 15.897112846374512, + "learning_rate": 2.7975245855009662e-05, + "loss": 2.7909, + "step": 4251000 + }, + { + "epoch": 1.3216406809799073, + "grad_norm": 12.912467002868652, + "learning_rate": 2.7972655317001546e-05, + "loss": 2.8268, + "step": 4251500 + }, + { + "epoch": 1.3217961132603941, + "grad_norm": 45.118282318115234, + "learning_rate": 2.7970064778993433e-05, + "loss": 2.8308, + "step": 4252000 + }, + { + "epoch": 1.321951545540881, + "grad_norm": 9.123542785644531, + "learning_rate": 2.796747424098532e-05, + "loss": 2.7624, + "step": 4252500 + }, + { + "epoch": 1.3221069778213679, + "grad_norm": 12.34085750579834, + "learning_rate": 2.79648837029772e-05, + "loss": 2.8212, + "step": 4253000 + }, + { + "epoch": 1.3222624101018547, + "grad_norm": 10.582118034362793, + "learning_rate": 2.7962293164969088e-05, + "loss": 2.8147, + "step": 4253500 + }, + { + "epoch": 1.3224178423823416, + "grad_norm": 19.979429244995117, + "learning_rate": 2.7959702626960975e-05, + "loss": 2.8014, + "step": 4254000 + }, + { + "epoch": 1.3225732746628285, + "grad_norm": 9.026575088500977, + "learning_rate": 2.795711208895286e-05, + "loss": 2.8948, + "step": 4254500 + }, + { + "epoch": 1.3227287069433153, + "grad_norm": 10.33326244354248, + "learning_rate": 2.7954521550944746e-05, + "loss": 2.7618, + "step": 4255000 + }, + { + "epoch": 1.3228841392238022, + "grad_norm": 9.167952537536621, + "learning_rate": 2.7951931012936626e-05, + "loss": 2.8089, + "step": 4255500 + }, + { + "epoch": 1.323039571504289, + "grad_norm": 9.372239112854004, + "learning_rate": 2.7949340474928514e-05, + "loss": 2.768, + "step": 4256000 + }, + { + "epoch": 1.323195003784776, + "grad_norm": 7.011509895324707, + "learning_rate": 2.79467499369204e-05, + "loss": 2.8204, + "step": 4256500 + }, + { + "epoch": 1.323350436065263, + "grad_norm": 8.073533058166504, + "learning_rate": 2.7944159398912284e-05, + "loss": 2.8283, + "step": 4257000 + }, + { + "epoch": 1.32350586834575, + "grad_norm": 8.737195014953613, + "learning_rate": 2.794156886090417e-05, + "loss": 2.8215, + "step": 4257500 + }, + { + "epoch": 1.3236613006262368, + "grad_norm": 11.919788360595703, + "learning_rate": 2.793897832289606e-05, + "loss": 2.8199, + "step": 4258000 + }, + { + "epoch": 1.3238167329067236, + "grad_norm": 9.534979820251465, + "learning_rate": 2.793638778488794e-05, + "loss": 2.7814, + "step": 4258500 + }, + { + "epoch": 1.3239721651872105, + "grad_norm": 7.700001239776611, + "learning_rate": 2.793379724687983e-05, + "loss": 2.8123, + "step": 4259000 + }, + { + "epoch": 1.3241275974676974, + "grad_norm": 11.273625373840332, + "learning_rate": 2.7931206708871717e-05, + "loss": 2.8023, + "step": 4259500 + }, + { + "epoch": 1.3242830297481842, + "grad_norm": 10.96107292175293, + "learning_rate": 2.7928616170863597e-05, + "loss": 2.8555, + "step": 4260000 + }, + { + "epoch": 1.324438462028671, + "grad_norm": 7.562317848205566, + "learning_rate": 2.7926025632855484e-05, + "loss": 2.8174, + "step": 4260500 + }, + { + "epoch": 1.324593894309158, + "grad_norm": 9.045220375061035, + "learning_rate": 2.792343509484737e-05, + "loss": 2.8409, + "step": 4261000 + }, + { + "epoch": 1.3247493265896448, + "grad_norm": 8.612618446350098, + "learning_rate": 2.7920844556839255e-05, + "loss": 2.8255, + "step": 4261500 + }, + { + "epoch": 1.3249047588701317, + "grad_norm": 7.457284927368164, + "learning_rate": 2.7918254018831142e-05, + "loss": 2.806, + "step": 4262000 + }, + { + "epoch": 1.3250601911506186, + "grad_norm": 8.67147159576416, + "learning_rate": 2.7915663480823023e-05, + "loss": 2.7906, + "step": 4262500 + }, + { + "epoch": 1.3252156234311054, + "grad_norm": 7.170127868652344, + "learning_rate": 2.791307294281491e-05, + "loss": 2.778, + "step": 4263000 + }, + { + "epoch": 1.3253710557115923, + "grad_norm": 8.729085922241211, + "learning_rate": 2.7910482404806797e-05, + "loss": 2.7963, + "step": 4263500 + }, + { + "epoch": 1.3255264879920792, + "grad_norm": 8.97397232055664, + "learning_rate": 2.790789186679868e-05, + "loss": 2.7948, + "step": 4264000 + }, + { + "epoch": 1.325681920272566, + "grad_norm": 6.494032382965088, + "learning_rate": 2.7905301328790568e-05, + "loss": 2.7994, + "step": 4264500 + }, + { + "epoch": 1.325837352553053, + "grad_norm": 21.235107421875, + "learning_rate": 2.7902710790782455e-05, + "loss": 2.8048, + "step": 4265000 + }, + { + "epoch": 1.3259927848335398, + "grad_norm": 10.332463264465332, + "learning_rate": 2.7900120252774336e-05, + "loss": 2.8374, + "step": 4265500 + }, + { + "epoch": 1.3261482171140266, + "grad_norm": 8.924992561340332, + "learning_rate": 2.7897529714766223e-05, + "loss": 2.8541, + "step": 4266000 + }, + { + "epoch": 1.3263036493945135, + "grad_norm": 18.11025047302246, + "learning_rate": 2.789493917675811e-05, + "loss": 2.8187, + "step": 4266500 + }, + { + "epoch": 1.3264590816750004, + "grad_norm": 9.08984661102295, + "learning_rate": 2.7892348638749994e-05, + "loss": 2.8337, + "step": 4267000 + }, + { + "epoch": 1.3266145139554872, + "grad_norm": 10.020441055297852, + "learning_rate": 2.788975810074188e-05, + "loss": 2.8276, + "step": 4267500 + }, + { + "epoch": 1.326769946235974, + "grad_norm": 15.395849227905273, + "learning_rate": 2.788716756273376e-05, + "loss": 2.7711, + "step": 4268000 + }, + { + "epoch": 1.326925378516461, + "grad_norm": 9.497334480285645, + "learning_rate": 2.788457702472565e-05, + "loss": 2.7954, + "step": 4268500 + }, + { + "epoch": 1.3270808107969478, + "grad_norm": 9.059309005737305, + "learning_rate": 2.788198648671754e-05, + "loss": 2.847, + "step": 4269000 + }, + { + "epoch": 1.3272362430774347, + "grad_norm": 8.665748596191406, + "learning_rate": 2.787939594870942e-05, + "loss": 2.7758, + "step": 4269500 + }, + { + "epoch": 1.3273916753579216, + "grad_norm": 9.655393600463867, + "learning_rate": 2.7876805410701306e-05, + "loss": 2.87, + "step": 4270000 + }, + { + "epoch": 1.3275471076384084, + "grad_norm": 7.942022800445557, + "learning_rate": 2.7874214872693194e-05, + "loss": 2.8125, + "step": 4270500 + }, + { + "epoch": 1.3277025399188953, + "grad_norm": 8.058886528015137, + "learning_rate": 2.7871624334685077e-05, + "loss": 2.7771, + "step": 4271000 + }, + { + "epoch": 1.3278579721993822, + "grad_norm": 10.197469711303711, + "learning_rate": 2.7869033796676965e-05, + "loss": 2.8502, + "step": 4271500 + }, + { + "epoch": 1.3280134044798693, + "grad_norm": 7.6341047286987305, + "learning_rate": 2.786644325866885e-05, + "loss": 2.7936, + "step": 4272000 + }, + { + "epoch": 1.3281688367603561, + "grad_norm": 9.336706161499023, + "learning_rate": 2.7863852720660732e-05, + "loss": 2.8033, + "step": 4272500 + }, + { + "epoch": 1.328324269040843, + "grad_norm": 9.240514755249023, + "learning_rate": 2.786126218265262e-05, + "loss": 2.8063, + "step": 4273000 + }, + { + "epoch": 1.3284797013213299, + "grad_norm": 11.618252754211426, + "learning_rate": 2.7858671644644503e-05, + "loss": 2.7995, + "step": 4273500 + }, + { + "epoch": 1.3286351336018167, + "grad_norm": 18.516468048095703, + "learning_rate": 2.785608110663639e-05, + "loss": 2.7955, + "step": 4274000 + }, + { + "epoch": 1.3287905658823036, + "grad_norm": 7.150315284729004, + "learning_rate": 2.7853490568628277e-05, + "loss": 2.7878, + "step": 4274500 + }, + { + "epoch": 1.3289459981627905, + "grad_norm": 8.536711692810059, + "learning_rate": 2.7850900030620158e-05, + "loss": 2.8212, + "step": 4275000 + }, + { + "epoch": 1.3291014304432773, + "grad_norm": 8.501127243041992, + "learning_rate": 2.7848309492612045e-05, + "loss": 2.7977, + "step": 4275500 + }, + { + "epoch": 1.3292568627237642, + "grad_norm": 10.322332382202148, + "learning_rate": 2.7845718954603932e-05, + "loss": 2.775, + "step": 4276000 + }, + { + "epoch": 1.329412295004251, + "grad_norm": 7.323704242706299, + "learning_rate": 2.7843128416595816e-05, + "loss": 2.8036, + "step": 4276500 + }, + { + "epoch": 1.329567727284738, + "grad_norm": 7.533858776092529, + "learning_rate": 2.7840537878587703e-05, + "loss": 2.8021, + "step": 4277000 + }, + { + "epoch": 1.3297231595652248, + "grad_norm": 6.718867301940918, + "learning_rate": 2.783794734057959e-05, + "loss": 2.796, + "step": 4277500 + }, + { + "epoch": 1.3298785918457117, + "grad_norm": 8.522860527038574, + "learning_rate": 2.783535680257147e-05, + "loss": 2.8258, + "step": 4278000 + }, + { + "epoch": 1.3300340241261985, + "grad_norm": 13.050968170166016, + "learning_rate": 2.7832766264563358e-05, + "loss": 2.7724, + "step": 4278500 + }, + { + "epoch": 1.3301894564066854, + "grad_norm": 19.140233993530273, + "learning_rate": 2.7830175726555248e-05, + "loss": 2.8034, + "step": 4279000 + }, + { + "epoch": 1.3303448886871723, + "grad_norm": 43.172786712646484, + "learning_rate": 2.782758518854713e-05, + "loss": 2.8104, + "step": 4279500 + }, + { + "epoch": 1.3305003209676591, + "grad_norm": 11.194939613342285, + "learning_rate": 2.7824994650539016e-05, + "loss": 2.7849, + "step": 4280000 + }, + { + "epoch": 1.330655753248146, + "grad_norm": 9.181412696838379, + "learning_rate": 2.7822404112530896e-05, + "loss": 2.8352, + "step": 4280500 + }, + { + "epoch": 1.330811185528633, + "grad_norm": 9.242228507995605, + "learning_rate": 2.7819813574522787e-05, + "loss": 2.8193, + "step": 4281000 + }, + { + "epoch": 1.33096661780912, + "grad_norm": 9.33366584777832, + "learning_rate": 2.7817223036514674e-05, + "loss": 2.8075, + "step": 4281500 + }, + { + "epoch": 1.3311220500896068, + "grad_norm": 15.995288848876953, + "learning_rate": 2.7814632498506554e-05, + "loss": 2.7892, + "step": 4282000 + }, + { + "epoch": 1.3312774823700937, + "grad_norm": 7.157470703125, + "learning_rate": 2.781204196049844e-05, + "loss": 2.8056, + "step": 4282500 + }, + { + "epoch": 1.3314329146505806, + "grad_norm": 10.559473037719727, + "learning_rate": 2.780945142249033e-05, + "loss": 2.7938, + "step": 4283000 + }, + { + "epoch": 1.3315883469310674, + "grad_norm": 7.041962146759033, + "learning_rate": 2.7806860884482212e-05, + "loss": 2.7834, + "step": 4283500 + }, + { + "epoch": 1.3317437792115543, + "grad_norm": 8.487964630126953, + "learning_rate": 2.78042703464741e-05, + "loss": 2.824, + "step": 4284000 + }, + { + "epoch": 1.3318992114920412, + "grad_norm": 9.06616497039795, + "learning_rate": 2.7801679808465986e-05, + "loss": 2.8295, + "step": 4284500 + }, + { + "epoch": 1.332054643772528, + "grad_norm": 9.618731498718262, + "learning_rate": 2.7799089270457867e-05, + "loss": 2.7964, + "step": 4285000 + }, + { + "epoch": 1.332210076053015, + "grad_norm": 10.820562362670898, + "learning_rate": 2.7796498732449754e-05, + "loss": 2.8151, + "step": 4285500 + }, + { + "epoch": 1.3323655083335018, + "grad_norm": 8.922747611999512, + "learning_rate": 2.7793908194441638e-05, + "loss": 2.8314, + "step": 4286000 + }, + { + "epoch": 1.3325209406139886, + "grad_norm": 8.478522300720215, + "learning_rate": 2.7791317656433525e-05, + "loss": 2.8267, + "step": 4286500 + }, + { + "epoch": 1.3326763728944755, + "grad_norm": 8.945989608764648, + "learning_rate": 2.7788727118425412e-05, + "loss": 2.8461, + "step": 4287000 + }, + { + "epoch": 1.3328318051749624, + "grad_norm": 33.38187026977539, + "learning_rate": 2.7786136580417292e-05, + "loss": 2.8133, + "step": 4287500 + }, + { + "epoch": 1.3329872374554492, + "grad_norm": 18.76862335205078, + "learning_rate": 2.778354604240918e-05, + "loss": 2.859, + "step": 4288000 + }, + { + "epoch": 1.333142669735936, + "grad_norm": 8.11611557006836, + "learning_rate": 2.7780955504401067e-05, + "loss": 2.8135, + "step": 4288500 + }, + { + "epoch": 1.333298102016423, + "grad_norm": 8.320319175720215, + "learning_rate": 2.777836496639295e-05, + "loss": 2.8119, + "step": 4289000 + }, + { + "epoch": 1.3334535342969098, + "grad_norm": 8.77120304107666, + "learning_rate": 2.7775774428384838e-05, + "loss": 2.8114, + "step": 4289500 + }, + { + "epoch": 1.3336089665773967, + "grad_norm": 8.64730453491211, + "learning_rate": 2.7773183890376725e-05, + "loss": 2.7965, + "step": 4290000 + }, + { + "epoch": 1.3337643988578836, + "grad_norm": 9.43348217010498, + "learning_rate": 2.7770593352368605e-05, + "loss": 2.8314, + "step": 4290500 + }, + { + "epoch": 1.3339198311383704, + "grad_norm": 12.269176483154297, + "learning_rate": 2.7768002814360496e-05, + "loss": 2.7882, + "step": 4291000 + }, + { + "epoch": 1.3340752634188573, + "grad_norm": 7.793875217437744, + "learning_rate": 2.7765412276352376e-05, + "loss": 2.8195, + "step": 4291500 + }, + { + "epoch": 1.3342306956993442, + "grad_norm": 8.353142738342285, + "learning_rate": 2.7762821738344263e-05, + "loss": 2.7913, + "step": 4292000 + }, + { + "epoch": 1.334386127979831, + "grad_norm": 11.57436466217041, + "learning_rate": 2.776023120033615e-05, + "loss": 2.8159, + "step": 4292500 + }, + { + "epoch": 1.334541560260318, + "grad_norm": 10.192419052124023, + "learning_rate": 2.7757640662328034e-05, + "loss": 2.8251, + "step": 4293000 + }, + { + "epoch": 1.3346969925408048, + "grad_norm": 8.575221061706543, + "learning_rate": 2.775505012431992e-05, + "loss": 2.8043, + "step": 4293500 + }, + { + "epoch": 1.3348524248212916, + "grad_norm": 8.136049270629883, + "learning_rate": 2.775245958631181e-05, + "loss": 2.7771, + "step": 4294000 + }, + { + "epoch": 1.3350078571017785, + "grad_norm": 16.1328182220459, + "learning_rate": 2.774986904830369e-05, + "loss": 2.8252, + "step": 4294500 + }, + { + "epoch": 1.3351632893822654, + "grad_norm": 8.624217987060547, + "learning_rate": 2.7747278510295576e-05, + "loss": 2.7706, + "step": 4295000 + }, + { + "epoch": 1.3353187216627522, + "grad_norm": 7.864494323730469, + "learning_rate": 2.7744687972287463e-05, + "loss": 2.7829, + "step": 4295500 + }, + { + "epoch": 1.3354741539432393, + "grad_norm": 7.985511779785156, + "learning_rate": 2.7742097434279347e-05, + "loss": 2.8514, + "step": 4296000 + }, + { + "epoch": 1.3356295862237262, + "grad_norm": 15.784975051879883, + "learning_rate": 2.7739506896271234e-05, + "loss": 2.7814, + "step": 4296500 + }, + { + "epoch": 1.335785018504213, + "grad_norm": 8.903947830200195, + "learning_rate": 2.773691635826312e-05, + "loss": 2.8137, + "step": 4297000 + }, + { + "epoch": 1.3359404507847, + "grad_norm": 7.611512660980225, + "learning_rate": 2.7734325820255e-05, + "loss": 2.8382, + "step": 4297500 + }, + { + "epoch": 1.3360958830651868, + "grad_norm": 12.00638198852539, + "learning_rate": 2.773173528224689e-05, + "loss": 2.8309, + "step": 4298000 + }, + { + "epoch": 1.3362513153456737, + "grad_norm": 12.127983093261719, + "learning_rate": 2.7729144744238773e-05, + "loss": 2.7867, + "step": 4298500 + }, + { + "epoch": 1.3364067476261605, + "grad_norm": 8.593988418579102, + "learning_rate": 2.772655420623066e-05, + "loss": 2.8258, + "step": 4299000 + }, + { + "epoch": 1.3365621799066474, + "grad_norm": 8.64961051940918, + "learning_rate": 2.7723963668222547e-05, + "loss": 2.8329, + "step": 4299500 + }, + { + "epoch": 1.3367176121871343, + "grad_norm": 7.367046356201172, + "learning_rate": 2.7721373130214427e-05, + "loss": 2.7883, + "step": 4300000 + }, + { + "epoch": 1.3368730444676211, + "grad_norm": 10.01401138305664, + "learning_rate": 2.7718782592206314e-05, + "loss": 2.8459, + "step": 4300500 + }, + { + "epoch": 1.337028476748108, + "grad_norm": 10.645508766174316, + "learning_rate": 2.7716192054198205e-05, + "loss": 2.8172, + "step": 4301000 + }, + { + "epoch": 1.3371839090285949, + "grad_norm": 9.919354438781738, + "learning_rate": 2.7713601516190085e-05, + "loss": 2.8356, + "step": 4301500 + }, + { + "epoch": 1.3373393413090817, + "grad_norm": 9.70582389831543, + "learning_rate": 2.7711010978181972e-05, + "loss": 2.7991, + "step": 4302000 + }, + { + "epoch": 1.3374947735895686, + "grad_norm": 9.56097412109375, + "learning_rate": 2.770842044017386e-05, + "loss": 2.8257, + "step": 4302500 + }, + { + "epoch": 1.3376502058700555, + "grad_norm": 8.273656845092773, + "learning_rate": 2.7705829902165743e-05, + "loss": 2.8462, + "step": 4303000 + }, + { + "epoch": 1.3378056381505423, + "grad_norm": 10.098939895629883, + "learning_rate": 2.770323936415763e-05, + "loss": 2.7892, + "step": 4303500 + }, + { + "epoch": 1.3379610704310292, + "grad_norm": 9.863287925720215, + "learning_rate": 2.770064882614951e-05, + "loss": 2.8095, + "step": 4304000 + }, + { + "epoch": 1.338116502711516, + "grad_norm": 8.139185905456543, + "learning_rate": 2.7698058288141398e-05, + "loss": 2.8165, + "step": 4304500 + }, + { + "epoch": 1.3382719349920031, + "grad_norm": 8.56108283996582, + "learning_rate": 2.7695467750133285e-05, + "loss": 2.7693, + "step": 4305000 + }, + { + "epoch": 1.33842736727249, + "grad_norm": 10.399855613708496, + "learning_rate": 2.769287721212517e-05, + "loss": 2.8015, + "step": 4305500 + }, + { + "epoch": 1.3385827995529769, + "grad_norm": 10.519743919372559, + "learning_rate": 2.7690286674117056e-05, + "loss": 2.8235, + "step": 4306000 + }, + { + "epoch": 1.3387382318334637, + "grad_norm": 11.213105201721191, + "learning_rate": 2.7687696136108943e-05, + "loss": 2.8316, + "step": 4306500 + }, + { + "epoch": 1.3388936641139506, + "grad_norm": 10.035384178161621, + "learning_rate": 2.7685105598100824e-05, + "loss": 2.8442, + "step": 4307000 + }, + { + "epoch": 1.3390490963944375, + "grad_norm": 9.740561485290527, + "learning_rate": 2.768251506009271e-05, + "loss": 2.8221, + "step": 4307500 + }, + { + "epoch": 1.3392045286749243, + "grad_norm": 13.426712989807129, + "learning_rate": 2.7679924522084598e-05, + "loss": 2.8071, + "step": 4308000 + }, + { + "epoch": 1.3393599609554112, + "grad_norm": 12.504658699035645, + "learning_rate": 2.7677333984076482e-05, + "loss": 2.7866, + "step": 4308500 + }, + { + "epoch": 1.339515393235898, + "grad_norm": 8.984999656677246, + "learning_rate": 2.767474344606837e-05, + "loss": 2.8099, + "step": 4309000 + }, + { + "epoch": 1.339670825516385, + "grad_norm": 20.48618507385254, + "learning_rate": 2.767215290806025e-05, + "loss": 2.8539, + "step": 4309500 + }, + { + "epoch": 1.3398262577968718, + "grad_norm": 11.057222366333008, + "learning_rate": 2.7669562370052136e-05, + "loss": 2.8481, + "step": 4310000 + }, + { + "epoch": 1.3399816900773587, + "grad_norm": 12.81786060333252, + "learning_rate": 2.7666971832044024e-05, + "loss": 2.7777, + "step": 4310500 + }, + { + "epoch": 1.3401371223578455, + "grad_norm": 10.904480934143066, + "learning_rate": 2.7664381294035907e-05, + "loss": 2.8043, + "step": 4311000 + }, + { + "epoch": 1.3402925546383324, + "grad_norm": 8.67243766784668, + "learning_rate": 2.7661790756027794e-05, + "loss": 2.8285, + "step": 4311500 + }, + { + "epoch": 1.3404479869188193, + "grad_norm": 9.8673095703125, + "learning_rate": 2.765920021801968e-05, + "loss": 2.7712, + "step": 4312000 + }, + { + "epoch": 1.3406034191993061, + "grad_norm": 8.82345962524414, + "learning_rate": 2.7656609680011565e-05, + "loss": 2.7871, + "step": 4312500 + }, + { + "epoch": 1.340758851479793, + "grad_norm": 8.926363945007324, + "learning_rate": 2.7654019142003453e-05, + "loss": 2.8605, + "step": 4313000 + }, + { + "epoch": 1.3409142837602799, + "grad_norm": 23.815797805786133, + "learning_rate": 2.765142860399534e-05, + "loss": 2.8183, + "step": 4313500 + }, + { + "epoch": 1.3410697160407667, + "grad_norm": 8.495173454284668, + "learning_rate": 2.764883806598722e-05, + "loss": 2.8394, + "step": 4314000 + }, + { + "epoch": 1.3412251483212536, + "grad_norm": 13.640304565429688, + "learning_rate": 2.7646247527979107e-05, + "loss": 2.8131, + "step": 4314500 + }, + { + "epoch": 1.3413805806017405, + "grad_norm": 10.092106819152832, + "learning_rate": 2.7643656989970994e-05, + "loss": 2.7985, + "step": 4315000 + }, + { + "epoch": 1.3415360128822273, + "grad_norm": 8.080842971801758, + "learning_rate": 2.7641066451962878e-05, + "loss": 2.8683, + "step": 4315500 + }, + { + "epoch": 1.3416914451627142, + "grad_norm": 19.925708770751953, + "learning_rate": 2.7638475913954765e-05, + "loss": 2.8081, + "step": 4316000 + }, + { + "epoch": 1.341846877443201, + "grad_norm": 9.677642822265625, + "learning_rate": 2.7635885375946646e-05, + "loss": 2.8573, + "step": 4316500 + }, + { + "epoch": 1.342002309723688, + "grad_norm": 8.789156913757324, + "learning_rate": 2.7633294837938533e-05, + "loss": 2.7798, + "step": 4317000 + }, + { + "epoch": 1.3421577420041748, + "grad_norm": 37.14491271972656, + "learning_rate": 2.763070429993042e-05, + "loss": 2.7812, + "step": 4317500 + }, + { + "epoch": 1.3423131742846617, + "grad_norm": 9.62828254699707, + "learning_rate": 2.7628113761922304e-05, + "loss": 2.8041, + "step": 4318000 + }, + { + "epoch": 1.3424686065651485, + "grad_norm": 10.428988456726074, + "learning_rate": 2.762552322391419e-05, + "loss": 2.7816, + "step": 4318500 + }, + { + "epoch": 1.3426240388456354, + "grad_norm": 10.155138969421387, + "learning_rate": 2.7622932685906078e-05, + "loss": 2.8308, + "step": 4319000 + }, + { + "epoch": 1.3427794711261223, + "grad_norm": 9.405195236206055, + "learning_rate": 2.762034214789796e-05, + "loss": 2.8279, + "step": 4319500 + }, + { + "epoch": 1.3429349034066094, + "grad_norm": 8.358577728271484, + "learning_rate": 2.7617751609889846e-05, + "loss": 2.7826, + "step": 4320000 + }, + { + "epoch": 1.3430903356870962, + "grad_norm": 10.25827693939209, + "learning_rate": 2.7615161071881733e-05, + "loss": 2.8052, + "step": 4320500 + }, + { + "epoch": 1.343245767967583, + "grad_norm": 10.186942100524902, + "learning_rate": 2.7612570533873617e-05, + "loss": 2.7987, + "step": 4321000 + }, + { + "epoch": 1.34340120024807, + "grad_norm": 8.951530456542969, + "learning_rate": 2.7609979995865504e-05, + "loss": 2.8088, + "step": 4321500 + }, + { + "epoch": 1.3435566325285568, + "grad_norm": 8.777172088623047, + "learning_rate": 2.7607389457857384e-05, + "loss": 2.8654, + "step": 4322000 + }, + { + "epoch": 1.3437120648090437, + "grad_norm": 8.840458869934082, + "learning_rate": 2.7604798919849275e-05, + "loss": 2.8124, + "step": 4322500 + }, + { + "epoch": 1.3438674970895306, + "grad_norm": 11.057960510253906, + "learning_rate": 2.7602208381841162e-05, + "loss": 2.8636, + "step": 4323000 + }, + { + "epoch": 1.3440229293700174, + "grad_norm": 8.29940414428711, + "learning_rate": 2.7599617843833042e-05, + "loss": 2.7942, + "step": 4323500 + }, + { + "epoch": 1.3441783616505043, + "grad_norm": 9.954084396362305, + "learning_rate": 2.759702730582493e-05, + "loss": 2.7695, + "step": 4324000 + }, + { + "epoch": 1.3443337939309912, + "grad_norm": 9.912012100219727, + "learning_rate": 2.7594436767816816e-05, + "loss": 2.7796, + "step": 4324500 + }, + { + "epoch": 1.344489226211478, + "grad_norm": 8.319746971130371, + "learning_rate": 2.75918462298087e-05, + "loss": 2.7549, + "step": 4325000 + }, + { + "epoch": 1.344644658491965, + "grad_norm": 9.561713218688965, + "learning_rate": 2.7589255691800587e-05, + "loss": 2.8107, + "step": 4325500 + }, + { + "epoch": 1.3448000907724518, + "grad_norm": 16.135848999023438, + "learning_rate": 2.7586665153792474e-05, + "loss": 2.7599, + "step": 4326000 + }, + { + "epoch": 1.3449555230529386, + "grad_norm": 10.739371299743652, + "learning_rate": 2.7584074615784355e-05, + "loss": 2.7983, + "step": 4326500 + }, + { + "epoch": 1.3451109553334255, + "grad_norm": 10.38913345336914, + "learning_rate": 2.7581484077776242e-05, + "loss": 2.8295, + "step": 4327000 + }, + { + "epoch": 1.3452663876139124, + "grad_norm": 10.029374122619629, + "learning_rate": 2.7578893539768126e-05, + "loss": 2.807, + "step": 4327500 + }, + { + "epoch": 1.3454218198943992, + "grad_norm": 75.7383804321289, + "learning_rate": 2.7576303001760013e-05, + "loss": 2.8323, + "step": 4328000 + }, + { + "epoch": 1.345577252174886, + "grad_norm": 11.81347370147705, + "learning_rate": 2.75737124637519e-05, + "loss": 2.8139, + "step": 4328500 + }, + { + "epoch": 1.3457326844553732, + "grad_norm": 8.47840404510498, + "learning_rate": 2.757112192574378e-05, + "loss": 2.8212, + "step": 4329000 + }, + { + "epoch": 1.34588811673586, + "grad_norm": 9.899946212768555, + "learning_rate": 2.7568531387735668e-05, + "loss": 2.8274, + "step": 4329500 + }, + { + "epoch": 1.346043549016347, + "grad_norm": 11.057470321655273, + "learning_rate": 2.7565940849727555e-05, + "loss": 2.7721, + "step": 4330000 + }, + { + "epoch": 1.3461989812968338, + "grad_norm": 8.01268482208252, + "learning_rate": 2.756335031171944e-05, + "loss": 2.804, + "step": 4330500 + }, + { + "epoch": 1.3463544135773207, + "grad_norm": 11.362696647644043, + "learning_rate": 2.7560759773711326e-05, + "loss": 2.7959, + "step": 4331000 + }, + { + "epoch": 1.3465098458578075, + "grad_norm": 11.9631929397583, + "learning_rate": 2.7558169235703213e-05, + "loss": 2.829, + "step": 4331500 + }, + { + "epoch": 1.3466652781382944, + "grad_norm": 9.519179344177246, + "learning_rate": 2.7555578697695093e-05, + "loss": 2.8239, + "step": 4332000 + }, + { + "epoch": 1.3468207104187813, + "grad_norm": 15.841750144958496, + "learning_rate": 2.7552988159686984e-05, + "loss": 2.7812, + "step": 4332500 + }, + { + "epoch": 1.3469761426992681, + "grad_norm": 8.88288402557373, + "learning_rate": 2.755039762167887e-05, + "loss": 2.8482, + "step": 4333000 + }, + { + "epoch": 1.347131574979755, + "grad_norm": 9.499544143676758, + "learning_rate": 2.754780708367075e-05, + "loss": 2.7853, + "step": 4333500 + }, + { + "epoch": 1.3472870072602419, + "grad_norm": 12.076677322387695, + "learning_rate": 2.754521654566264e-05, + "loss": 2.7916, + "step": 4334000 + }, + { + "epoch": 1.3474424395407287, + "grad_norm": 8.489877700805664, + "learning_rate": 2.7542626007654522e-05, + "loss": 2.7964, + "step": 4334500 + }, + { + "epoch": 1.3475978718212156, + "grad_norm": 10.977975845336914, + "learning_rate": 2.754003546964641e-05, + "loss": 2.7985, + "step": 4335000 + }, + { + "epoch": 1.3477533041017025, + "grad_norm": 7.884479999542236, + "learning_rate": 2.7537444931638297e-05, + "loss": 2.7817, + "step": 4335500 + }, + { + "epoch": 1.3479087363821893, + "grad_norm": 10.77899169921875, + "learning_rate": 2.7534854393630177e-05, + "loss": 2.7924, + "step": 4336000 + }, + { + "epoch": 1.3480641686626762, + "grad_norm": 13.345934867858887, + "learning_rate": 2.7532263855622064e-05, + "loss": 2.7989, + "step": 4336500 + }, + { + "epoch": 1.348219600943163, + "grad_norm": 14.142314910888672, + "learning_rate": 2.752967331761395e-05, + "loss": 2.811, + "step": 4337000 + }, + { + "epoch": 1.34837503322365, + "grad_norm": 8.6904878616333, + "learning_rate": 2.7527082779605835e-05, + "loss": 2.8474, + "step": 4337500 + }, + { + "epoch": 1.3485304655041368, + "grad_norm": 10.168170928955078, + "learning_rate": 2.7524492241597722e-05, + "loss": 2.7801, + "step": 4338000 + }, + { + "epoch": 1.3486858977846237, + "grad_norm": 8.544937133789062, + "learning_rate": 2.752190170358961e-05, + "loss": 2.7878, + "step": 4338500 + }, + { + "epoch": 1.3488413300651105, + "grad_norm": 10.420306205749512, + "learning_rate": 2.751931116558149e-05, + "loss": 2.7938, + "step": 4339000 + }, + { + "epoch": 1.3489967623455974, + "grad_norm": 8.106666564941406, + "learning_rate": 2.7516720627573377e-05, + "loss": 2.8329, + "step": 4339500 + }, + { + "epoch": 1.3491521946260843, + "grad_norm": 7.985296726226807, + "learning_rate": 2.751413008956526e-05, + "loss": 2.8553, + "step": 4340000 + }, + { + "epoch": 1.3493076269065711, + "grad_norm": 8.042737007141113, + "learning_rate": 2.7511539551557148e-05, + "loss": 2.7586, + "step": 4340500 + }, + { + "epoch": 1.349463059187058, + "grad_norm": 7.791204452514648, + "learning_rate": 2.7508949013549035e-05, + "loss": 2.8132, + "step": 4341000 + }, + { + "epoch": 1.3496184914675449, + "grad_norm": 9.169205665588379, + "learning_rate": 2.7506358475540915e-05, + "loss": 2.7567, + "step": 4341500 + }, + { + "epoch": 1.3497739237480317, + "grad_norm": 34.43048858642578, + "learning_rate": 2.7503767937532802e-05, + "loss": 2.8048, + "step": 4342000 + }, + { + "epoch": 1.3499293560285186, + "grad_norm": 49.3509407043457, + "learning_rate": 2.7501177399524693e-05, + "loss": 2.8384, + "step": 4342500 + }, + { + "epoch": 1.3500847883090055, + "grad_norm": 13.727352142333984, + "learning_rate": 2.7498586861516573e-05, + "loss": 2.7841, + "step": 4343000 + }, + { + "epoch": 1.3502402205894923, + "grad_norm": 10.609845161437988, + "learning_rate": 2.749599632350846e-05, + "loss": 2.8101, + "step": 4343500 + }, + { + "epoch": 1.3503956528699794, + "grad_norm": 9.077472686767578, + "learning_rate": 2.7493405785500348e-05, + "loss": 2.8367, + "step": 4344000 + }, + { + "epoch": 1.3505510851504663, + "grad_norm": 10.372476577758789, + "learning_rate": 2.749081524749223e-05, + "loss": 2.8137, + "step": 4344500 + }, + { + "epoch": 1.3507065174309532, + "grad_norm": 9.066864013671875, + "learning_rate": 2.748822470948412e-05, + "loss": 2.8253, + "step": 4345000 + }, + { + "epoch": 1.35086194971144, + "grad_norm": 9.207106590270996, + "learning_rate": 2.7485634171476e-05, + "loss": 2.786, + "step": 4345500 + }, + { + "epoch": 1.351017381991927, + "grad_norm": 6.99215030670166, + "learning_rate": 2.7483043633467886e-05, + "loss": 2.8037, + "step": 4346000 + }, + { + "epoch": 1.3511728142724138, + "grad_norm": 8.08045482635498, + "learning_rate": 2.7480453095459773e-05, + "loss": 2.867, + "step": 4346500 + }, + { + "epoch": 1.3513282465529006, + "grad_norm": 10.976676940917969, + "learning_rate": 2.7477862557451657e-05, + "loss": 2.7944, + "step": 4347000 + }, + { + "epoch": 1.3514836788333875, + "grad_norm": 11.552818298339844, + "learning_rate": 2.7475272019443544e-05, + "loss": 2.8194, + "step": 4347500 + }, + { + "epoch": 1.3516391111138744, + "grad_norm": 10.813377380371094, + "learning_rate": 2.747268148143543e-05, + "loss": 2.8536, + "step": 4348000 + }, + { + "epoch": 1.3517945433943612, + "grad_norm": 3.777798891067505, + "learning_rate": 2.747009094342731e-05, + "loss": 2.8089, + "step": 4348500 + }, + { + "epoch": 1.351949975674848, + "grad_norm": 7.67891263961792, + "learning_rate": 2.74675004054192e-05, + "loss": 2.8476, + "step": 4349000 + }, + { + "epoch": 1.352105407955335, + "grad_norm": 10.906489372253418, + "learning_rate": 2.7464909867411086e-05, + "loss": 2.7773, + "step": 4349500 + }, + { + "epoch": 1.3522608402358218, + "grad_norm": 8.902979850769043, + "learning_rate": 2.746231932940297e-05, + "loss": 2.8277, + "step": 4350000 + }, + { + "epoch": 1.3524162725163087, + "grad_norm": 12.766379356384277, + "learning_rate": 2.7459728791394857e-05, + "loss": 2.847, + "step": 4350500 + }, + { + "epoch": 1.3525717047967956, + "grad_norm": 8.482848167419434, + "learning_rate": 2.7457138253386744e-05, + "loss": 2.7903, + "step": 4351000 + }, + { + "epoch": 1.3527271370772824, + "grad_norm": 8.718551635742188, + "learning_rate": 2.7454547715378624e-05, + "loss": 2.8123, + "step": 4351500 + }, + { + "epoch": 1.3528825693577693, + "grad_norm": 7.658156394958496, + "learning_rate": 2.745195717737051e-05, + "loss": 2.868, + "step": 4352000 + }, + { + "epoch": 1.3530380016382562, + "grad_norm": 8.500235557556152, + "learning_rate": 2.7449366639362395e-05, + "loss": 2.8072, + "step": 4352500 + }, + { + "epoch": 1.3531934339187432, + "grad_norm": 7.202610015869141, + "learning_rate": 2.7446776101354283e-05, + "loss": 2.8449, + "step": 4353000 + }, + { + "epoch": 1.3533488661992301, + "grad_norm": 17.2629451751709, + "learning_rate": 2.744418556334617e-05, + "loss": 2.8309, + "step": 4353500 + }, + { + "epoch": 1.353504298479717, + "grad_norm": 16.233325958251953, + "learning_rate": 2.744159502533805e-05, + "loss": 2.8112, + "step": 4354000 + }, + { + "epoch": 1.3536597307602038, + "grad_norm": 10.067059516906738, + "learning_rate": 2.743900448732994e-05, + "loss": 2.8094, + "step": 4354500 + }, + { + "epoch": 1.3538151630406907, + "grad_norm": 9.218438148498535, + "learning_rate": 2.7436413949321828e-05, + "loss": 2.7836, + "step": 4355000 + }, + { + "epoch": 1.3539705953211776, + "grad_norm": 13.523210525512695, + "learning_rate": 2.7433823411313708e-05, + "loss": 2.782, + "step": 4355500 + }, + { + "epoch": 1.3541260276016645, + "grad_norm": 8.627674102783203, + "learning_rate": 2.7431232873305595e-05, + "loss": 2.7664, + "step": 4356000 + }, + { + "epoch": 1.3542814598821513, + "grad_norm": 6.298739433288574, + "learning_rate": 2.7428642335297482e-05, + "loss": 2.7908, + "step": 4356500 + }, + { + "epoch": 1.3544368921626382, + "grad_norm": 9.33495044708252, + "learning_rate": 2.7426051797289366e-05, + "loss": 2.8018, + "step": 4357000 + }, + { + "epoch": 1.354592324443125, + "grad_norm": 8.090333938598633, + "learning_rate": 2.7423461259281253e-05, + "loss": 2.7869, + "step": 4357500 + }, + { + "epoch": 1.354747756723612, + "grad_norm": 10.316727638244629, + "learning_rate": 2.7420870721273134e-05, + "loss": 2.8258, + "step": 4358000 + }, + { + "epoch": 1.3549031890040988, + "grad_norm": 11.453189849853516, + "learning_rate": 2.741828018326502e-05, + "loss": 2.8105, + "step": 4358500 + }, + { + "epoch": 1.3550586212845857, + "grad_norm": 8.56650161743164, + "learning_rate": 2.7415689645256908e-05, + "loss": 2.7745, + "step": 4359000 + }, + { + "epoch": 1.3552140535650725, + "grad_norm": 8.908783912658691, + "learning_rate": 2.7413099107248792e-05, + "loss": 2.8073, + "step": 4359500 + }, + { + "epoch": 1.3553694858455594, + "grad_norm": 9.339035987854004, + "learning_rate": 2.741050856924068e-05, + "loss": 2.8036, + "step": 4360000 + }, + { + "epoch": 1.3555249181260463, + "grad_norm": 10.89275074005127, + "learning_rate": 2.7407918031232566e-05, + "loss": 2.8292, + "step": 4360500 + }, + { + "epoch": 1.3556803504065331, + "grad_norm": 17.238306045532227, + "learning_rate": 2.7405327493224446e-05, + "loss": 2.7958, + "step": 4361000 + }, + { + "epoch": 1.35583578268702, + "grad_norm": 9.360299110412598, + "learning_rate": 2.7402736955216334e-05, + "loss": 2.8167, + "step": 4361500 + }, + { + "epoch": 1.3559912149675069, + "grad_norm": 9.843851089477539, + "learning_rate": 2.740014641720822e-05, + "loss": 2.8197, + "step": 4362000 + }, + { + "epoch": 1.3561466472479937, + "grad_norm": 9.432785987854004, + "learning_rate": 2.7397555879200105e-05, + "loss": 2.8405, + "step": 4362500 + }, + { + "epoch": 1.3563020795284806, + "grad_norm": 9.372529983520508, + "learning_rate": 2.7394965341191992e-05, + "loss": 2.779, + "step": 4363000 + }, + { + "epoch": 1.3564575118089675, + "grad_norm": 6.628446578979492, + "learning_rate": 2.7392374803183872e-05, + "loss": 2.8086, + "step": 4363500 + }, + { + "epoch": 1.3566129440894543, + "grad_norm": 8.887470245361328, + "learning_rate": 2.738978426517576e-05, + "loss": 2.7584, + "step": 4364000 + }, + { + "epoch": 1.3567683763699412, + "grad_norm": 8.667348861694336, + "learning_rate": 2.738719372716765e-05, + "loss": 2.7741, + "step": 4364500 + }, + { + "epoch": 1.356923808650428, + "grad_norm": 12.09284782409668, + "learning_rate": 2.738460318915953e-05, + "loss": 2.7919, + "step": 4365000 + }, + { + "epoch": 1.357079240930915, + "grad_norm": 9.40685749053955, + "learning_rate": 2.7382012651151417e-05, + "loss": 2.8424, + "step": 4365500 + }, + { + "epoch": 1.3572346732114018, + "grad_norm": 8.38431453704834, + "learning_rate": 2.7379422113143304e-05, + "loss": 2.7786, + "step": 4366000 + }, + { + "epoch": 1.3573901054918887, + "grad_norm": 7.694559097290039, + "learning_rate": 2.7376831575135188e-05, + "loss": 2.8237, + "step": 4366500 + }, + { + "epoch": 1.3575455377723755, + "grad_norm": 16.457014083862305, + "learning_rate": 2.7374241037127075e-05, + "loss": 2.852, + "step": 4367000 + }, + { + "epoch": 1.3577009700528624, + "grad_norm": 6.501052379608154, + "learning_rate": 2.7371650499118963e-05, + "loss": 2.7905, + "step": 4367500 + }, + { + "epoch": 1.3578564023333495, + "grad_norm": 8.760849952697754, + "learning_rate": 2.7369059961110843e-05, + "loss": 2.8043, + "step": 4368000 + }, + { + "epoch": 1.3580118346138363, + "grad_norm": 7.118179798126221, + "learning_rate": 2.736646942310273e-05, + "loss": 2.8084, + "step": 4368500 + }, + { + "epoch": 1.3581672668943232, + "grad_norm": 9.75349235534668, + "learning_rate": 2.7363878885094617e-05, + "loss": 2.7963, + "step": 4369000 + }, + { + "epoch": 1.35832269917481, + "grad_norm": 9.646986961364746, + "learning_rate": 2.73612883470865e-05, + "loss": 2.8652, + "step": 4369500 + }, + { + "epoch": 1.358478131455297, + "grad_norm": 7.622162818908691, + "learning_rate": 2.7358697809078388e-05, + "loss": 2.8729, + "step": 4370000 + }, + { + "epoch": 1.3586335637357838, + "grad_norm": 8.882376670837402, + "learning_rate": 2.735610727107027e-05, + "loss": 2.7688, + "step": 4370500 + }, + { + "epoch": 1.3587889960162707, + "grad_norm": 9.610286712646484, + "learning_rate": 2.7353516733062156e-05, + "loss": 2.7975, + "step": 4371000 + }, + { + "epoch": 1.3589444282967575, + "grad_norm": 29.17272186279297, + "learning_rate": 2.7350926195054043e-05, + "loss": 2.7589, + "step": 4371500 + }, + { + "epoch": 1.3590998605772444, + "grad_norm": 10.792265892028809, + "learning_rate": 2.7348335657045927e-05, + "loss": 2.7838, + "step": 4372000 + }, + { + "epoch": 1.3592552928577313, + "grad_norm": 9.479751586914062, + "learning_rate": 2.7345745119037814e-05, + "loss": 2.8167, + "step": 4372500 + }, + { + "epoch": 1.3594107251382181, + "grad_norm": 10.171975135803223, + "learning_rate": 2.73431545810297e-05, + "loss": 2.7669, + "step": 4373000 + }, + { + "epoch": 1.359566157418705, + "grad_norm": 8.061612129211426, + "learning_rate": 2.734056404302158e-05, + "loss": 2.8354, + "step": 4373500 + }, + { + "epoch": 1.3597215896991919, + "grad_norm": 10.16324234008789, + "learning_rate": 2.733797350501347e-05, + "loss": 2.7934, + "step": 4374000 + }, + { + "epoch": 1.3598770219796787, + "grad_norm": 8.64631462097168, + "learning_rate": 2.733538296700536e-05, + "loss": 2.7921, + "step": 4374500 + }, + { + "epoch": 1.3600324542601656, + "grad_norm": 8.078301429748535, + "learning_rate": 2.733279242899724e-05, + "loss": 2.7716, + "step": 4375000 + }, + { + "epoch": 1.3601878865406525, + "grad_norm": 9.562698364257812, + "learning_rate": 2.7330201890989126e-05, + "loss": 2.7896, + "step": 4375500 + }, + { + "epoch": 1.3603433188211393, + "grad_norm": 7.867651462554932, + "learning_rate": 2.7327611352981007e-05, + "loss": 2.8001, + "step": 4376000 + }, + { + "epoch": 1.3604987511016262, + "grad_norm": 10.691628456115723, + "learning_rate": 2.7325020814972897e-05, + "loss": 2.8797, + "step": 4376500 + }, + { + "epoch": 1.3606541833821133, + "grad_norm": 9.534823417663574, + "learning_rate": 2.7322430276964785e-05, + "loss": 2.8013, + "step": 4377000 + }, + { + "epoch": 1.3608096156626002, + "grad_norm": 8.75854778289795, + "learning_rate": 2.7319839738956665e-05, + "loss": 2.7979, + "step": 4377500 + }, + { + "epoch": 1.360965047943087, + "grad_norm": 9.951037406921387, + "learning_rate": 2.7317249200948552e-05, + "loss": 2.7597, + "step": 4378000 + }, + { + "epoch": 1.361120480223574, + "grad_norm": 15.240657806396484, + "learning_rate": 2.731465866294044e-05, + "loss": 2.7995, + "step": 4378500 + }, + { + "epoch": 1.3612759125040608, + "grad_norm": 13.033146858215332, + "learning_rate": 2.7312068124932323e-05, + "loss": 2.8355, + "step": 4379000 + }, + { + "epoch": 1.3614313447845476, + "grad_norm": 9.784414291381836, + "learning_rate": 2.730947758692421e-05, + "loss": 2.7596, + "step": 4379500 + }, + { + "epoch": 1.3615867770650345, + "grad_norm": 10.107039451599121, + "learning_rate": 2.7306887048916097e-05, + "loss": 2.8179, + "step": 4380000 + }, + { + "epoch": 1.3617422093455214, + "grad_norm": 7.507584095001221, + "learning_rate": 2.7304296510907978e-05, + "loss": 2.8151, + "step": 4380500 + }, + { + "epoch": 1.3618976416260082, + "grad_norm": 11.521833419799805, + "learning_rate": 2.7301705972899865e-05, + "loss": 2.8353, + "step": 4381000 + }, + { + "epoch": 1.362053073906495, + "grad_norm": 16.051734924316406, + "learning_rate": 2.7299115434891752e-05, + "loss": 2.8212, + "step": 4381500 + }, + { + "epoch": 1.362208506186982, + "grad_norm": 7.834603309631348, + "learning_rate": 2.7296524896883636e-05, + "loss": 2.8287, + "step": 4382000 + }, + { + "epoch": 1.3623639384674688, + "grad_norm": 10.232629776000977, + "learning_rate": 2.7293934358875523e-05, + "loss": 2.7941, + "step": 4382500 + }, + { + "epoch": 1.3625193707479557, + "grad_norm": 7.209652423858643, + "learning_rate": 2.7291343820867403e-05, + "loss": 2.7875, + "step": 4383000 + }, + { + "epoch": 1.3626748030284426, + "grad_norm": 8.273189544677734, + "learning_rate": 2.728875328285929e-05, + "loss": 2.7895, + "step": 4383500 + }, + { + "epoch": 1.3628302353089294, + "grad_norm": 9.286230087280273, + "learning_rate": 2.7286162744851178e-05, + "loss": 2.8062, + "step": 4384000 + }, + { + "epoch": 1.3629856675894163, + "grad_norm": 8.60738468170166, + "learning_rate": 2.728357220684306e-05, + "loss": 2.7729, + "step": 4384500 + }, + { + "epoch": 1.3631410998699032, + "grad_norm": 8.329716682434082, + "learning_rate": 2.728098166883495e-05, + "loss": 2.8337, + "step": 4385000 + }, + { + "epoch": 1.36329653215039, + "grad_norm": 10.698822975158691, + "learning_rate": 2.7278391130826836e-05, + "loss": 2.8088, + "step": 4385500 + }, + { + "epoch": 1.363451964430877, + "grad_norm": 7.699779033660889, + "learning_rate": 2.7275800592818716e-05, + "loss": 2.8247, + "step": 4386000 + }, + { + "epoch": 1.3636073967113638, + "grad_norm": 9.401432991027832, + "learning_rate": 2.7273210054810607e-05, + "loss": 2.8235, + "step": 4386500 + }, + { + "epoch": 1.3637628289918506, + "grad_norm": 9.51640510559082, + "learning_rate": 2.7270619516802494e-05, + "loss": 2.7958, + "step": 4387000 + }, + { + "epoch": 1.3639182612723375, + "grad_norm": 17.727323532104492, + "learning_rate": 2.7268028978794374e-05, + "loss": 2.7668, + "step": 4387500 + }, + { + "epoch": 1.3640736935528244, + "grad_norm": 10.942800521850586, + "learning_rate": 2.726543844078626e-05, + "loss": 2.8363, + "step": 4388000 + }, + { + "epoch": 1.3642291258333112, + "grad_norm": 11.86299991607666, + "learning_rate": 2.7262847902778145e-05, + "loss": 2.8215, + "step": 4388500 + }, + { + "epoch": 1.364384558113798, + "grad_norm": 10.027734756469727, + "learning_rate": 2.7260257364770032e-05, + "loss": 2.7864, + "step": 4389000 + }, + { + "epoch": 1.364539990394285, + "grad_norm": 9.080909729003906, + "learning_rate": 2.725766682676192e-05, + "loss": 2.804, + "step": 4389500 + }, + { + "epoch": 1.3646954226747718, + "grad_norm": 9.283770561218262, + "learning_rate": 2.72550762887538e-05, + "loss": 2.7997, + "step": 4390000 + }, + { + "epoch": 1.3648508549552587, + "grad_norm": 10.216065406799316, + "learning_rate": 2.7252485750745687e-05, + "loss": 2.7859, + "step": 4390500 + }, + { + "epoch": 1.3650062872357456, + "grad_norm": 12.563907623291016, + "learning_rate": 2.7249895212737574e-05, + "loss": 2.8356, + "step": 4391000 + }, + { + "epoch": 1.3651617195162324, + "grad_norm": 7.744449138641357, + "learning_rate": 2.7247304674729458e-05, + "loss": 2.7674, + "step": 4391500 + }, + { + "epoch": 1.3653171517967193, + "grad_norm": 7.892867088317871, + "learning_rate": 2.7244714136721345e-05, + "loss": 2.7788, + "step": 4392000 + }, + { + "epoch": 1.3654725840772064, + "grad_norm": 24.044923782348633, + "learning_rate": 2.7242123598713232e-05, + "loss": 2.7933, + "step": 4392500 + }, + { + "epoch": 1.3656280163576933, + "grad_norm": 10.085283279418945, + "learning_rate": 2.7239533060705112e-05, + "loss": 2.8151, + "step": 4393000 + }, + { + "epoch": 1.3657834486381801, + "grad_norm": 16.331310272216797, + "learning_rate": 2.7236942522697e-05, + "loss": 2.7927, + "step": 4393500 + }, + { + "epoch": 1.365938880918667, + "grad_norm": 8.172765731811523, + "learning_rate": 2.7234351984688883e-05, + "loss": 2.8397, + "step": 4394000 + }, + { + "epoch": 1.3660943131991539, + "grad_norm": 9.533488273620605, + "learning_rate": 2.723176144668077e-05, + "loss": 2.8405, + "step": 4394500 + }, + { + "epoch": 1.3662497454796407, + "grad_norm": 11.14116382598877, + "learning_rate": 2.7229170908672658e-05, + "loss": 2.8285, + "step": 4395000 + }, + { + "epoch": 1.3664051777601276, + "grad_norm": 8.40738582611084, + "learning_rate": 2.7226580370664538e-05, + "loss": 2.7915, + "step": 4395500 + }, + { + "epoch": 1.3665606100406145, + "grad_norm": 12.051042556762695, + "learning_rate": 2.7223989832656425e-05, + "loss": 2.7675, + "step": 4396000 + }, + { + "epoch": 1.3667160423211013, + "grad_norm": 6.2137956619262695, + "learning_rate": 2.7221399294648316e-05, + "loss": 2.7531, + "step": 4396500 + }, + { + "epoch": 1.3668714746015882, + "grad_norm": 11.54831314086914, + "learning_rate": 2.7218808756640196e-05, + "loss": 2.8036, + "step": 4397000 + }, + { + "epoch": 1.367026906882075, + "grad_norm": 10.517288208007812, + "learning_rate": 2.7216218218632083e-05, + "loss": 2.8053, + "step": 4397500 + }, + { + "epoch": 1.367182339162562, + "grad_norm": 5.486637592315674, + "learning_rate": 2.721362768062397e-05, + "loss": 2.8247, + "step": 4398000 + }, + { + "epoch": 1.3673377714430488, + "grad_norm": 6.628695964813232, + "learning_rate": 2.7211037142615854e-05, + "loss": 2.7833, + "step": 4398500 + }, + { + "epoch": 1.3674932037235357, + "grad_norm": 8.76664924621582, + "learning_rate": 2.720844660460774e-05, + "loss": 2.7742, + "step": 4399000 + }, + { + "epoch": 1.3676486360040225, + "grad_norm": 9.74609661102295, + "learning_rate": 2.720585606659963e-05, + "loss": 2.818, + "step": 4399500 + }, + { + "epoch": 1.3678040682845094, + "grad_norm": 8.581003189086914, + "learning_rate": 2.720326552859151e-05, + "loss": 2.8148, + "step": 4400000 + }, + { + "epoch": 1.3679595005649963, + "grad_norm": 14.062593460083008, + "learning_rate": 2.7200674990583396e-05, + "loss": 2.7141, + "step": 4400500 + }, + { + "epoch": 1.3681149328454831, + "grad_norm": 9.687220573425293, + "learning_rate": 2.719808445257528e-05, + "loss": 2.812, + "step": 4401000 + }, + { + "epoch": 1.3682703651259702, + "grad_norm": 9.818559646606445, + "learning_rate": 2.7195493914567167e-05, + "loss": 2.8397, + "step": 4401500 + }, + { + "epoch": 1.368425797406457, + "grad_norm": 6.503810882568359, + "learning_rate": 2.7192903376559054e-05, + "loss": 2.8207, + "step": 4402000 + }, + { + "epoch": 1.368581229686944, + "grad_norm": 9.46337604522705, + "learning_rate": 2.7190312838550935e-05, + "loss": 2.8214, + "step": 4402500 + }, + { + "epoch": 1.3687366619674308, + "grad_norm": 7.522940158843994, + "learning_rate": 2.718772230054282e-05, + "loss": 2.7634, + "step": 4403000 + }, + { + "epoch": 1.3688920942479177, + "grad_norm": 9.441553115844727, + "learning_rate": 2.718513176253471e-05, + "loss": 2.7841, + "step": 4403500 + }, + { + "epoch": 1.3690475265284046, + "grad_norm": 9.079694747924805, + "learning_rate": 2.7182541224526593e-05, + "loss": 2.7967, + "step": 4404000 + }, + { + "epoch": 1.3692029588088914, + "grad_norm": 12.262892723083496, + "learning_rate": 2.717995068651848e-05, + "loss": 2.799, + "step": 4404500 + }, + { + "epoch": 1.3693583910893783, + "grad_norm": 13.57042121887207, + "learning_rate": 2.7177360148510367e-05, + "loss": 2.768, + "step": 4405000 + }, + { + "epoch": 1.3695138233698652, + "grad_norm": 9.009906768798828, + "learning_rate": 2.7174769610502247e-05, + "loss": 2.7724, + "step": 4405500 + }, + { + "epoch": 1.369669255650352, + "grad_norm": 35.01792526245117, + "learning_rate": 2.7172179072494134e-05, + "loss": 2.8048, + "step": 4406000 + }, + { + "epoch": 1.369824687930839, + "grad_norm": 12.398693084716797, + "learning_rate": 2.7169588534486018e-05, + "loss": 2.8165, + "step": 4406500 + }, + { + "epoch": 1.3699801202113258, + "grad_norm": 6.531327247619629, + "learning_rate": 2.7166997996477905e-05, + "loss": 2.7794, + "step": 4407000 + }, + { + "epoch": 1.3701355524918126, + "grad_norm": 11.163582801818848, + "learning_rate": 2.7164407458469792e-05, + "loss": 2.8023, + "step": 4407500 + }, + { + "epoch": 1.3702909847722995, + "grad_norm": 10.534843444824219, + "learning_rate": 2.7161816920461676e-05, + "loss": 2.8243, + "step": 4408000 + }, + { + "epoch": 1.3704464170527864, + "grad_norm": 11.93807315826416, + "learning_rate": 2.7159226382453563e-05, + "loss": 2.8059, + "step": 4408500 + }, + { + "epoch": 1.3706018493332732, + "grad_norm": 8.914746284484863, + "learning_rate": 2.715663584444545e-05, + "loss": 2.7899, + "step": 4409000 + }, + { + "epoch": 1.37075728161376, + "grad_norm": 8.104124069213867, + "learning_rate": 2.715404530643733e-05, + "loss": 2.8074, + "step": 4409500 + }, + { + "epoch": 1.370912713894247, + "grad_norm": 9.69919490814209, + "learning_rate": 2.7151454768429218e-05, + "loss": 2.7954, + "step": 4410000 + }, + { + "epoch": 1.3710681461747338, + "grad_norm": 9.832289695739746, + "learning_rate": 2.7148864230421105e-05, + "loss": 2.8162, + "step": 4410500 + }, + { + "epoch": 1.3712235784552207, + "grad_norm": 10.176831245422363, + "learning_rate": 2.714627369241299e-05, + "loss": 2.813, + "step": 4411000 + }, + { + "epoch": 1.3713790107357076, + "grad_norm": 10.289814949035645, + "learning_rate": 2.7143683154404876e-05, + "loss": 2.7857, + "step": 4411500 + }, + { + "epoch": 1.3715344430161944, + "grad_norm": 6.811809539794922, + "learning_rate": 2.7141092616396757e-05, + "loss": 2.8518, + "step": 4412000 + }, + { + "epoch": 1.3716898752966813, + "grad_norm": 10.35531234741211, + "learning_rate": 2.7138502078388644e-05, + "loss": 2.7432, + "step": 4412500 + }, + { + "epoch": 1.3718453075771682, + "grad_norm": 11.108176231384277, + "learning_rate": 2.713591154038053e-05, + "loss": 2.8267, + "step": 4413000 + }, + { + "epoch": 1.372000739857655, + "grad_norm": 9.07666301727295, + "learning_rate": 2.7133321002372415e-05, + "loss": 2.8461, + "step": 4413500 + }, + { + "epoch": 1.372156172138142, + "grad_norm": 9.49085521697998, + "learning_rate": 2.7130730464364302e-05, + "loss": 2.8014, + "step": 4414000 + }, + { + "epoch": 1.3723116044186288, + "grad_norm": 7.630502223968506, + "learning_rate": 2.712813992635619e-05, + "loss": 2.7943, + "step": 4414500 + }, + { + "epoch": 1.3724670366991156, + "grad_norm": 9.072049140930176, + "learning_rate": 2.712554938834807e-05, + "loss": 2.8069, + "step": 4415000 + }, + { + "epoch": 1.3726224689796025, + "grad_norm": 9.591782569885254, + "learning_rate": 2.7122958850339956e-05, + "loss": 2.8306, + "step": 4415500 + }, + { + "epoch": 1.3727779012600894, + "grad_norm": 11.23294734954834, + "learning_rate": 2.7120368312331844e-05, + "loss": 2.8368, + "step": 4416000 + }, + { + "epoch": 1.3729333335405765, + "grad_norm": 10.90556812286377, + "learning_rate": 2.7117777774323727e-05, + "loss": 2.8249, + "step": 4416500 + }, + { + "epoch": 1.3730887658210633, + "grad_norm": 43.64437484741211, + "learning_rate": 2.7115187236315615e-05, + "loss": 2.7749, + "step": 4417000 + }, + { + "epoch": 1.3732441981015502, + "grad_norm": 8.354198455810547, + "learning_rate": 2.71125966983075e-05, + "loss": 2.7889, + "step": 4417500 + }, + { + "epoch": 1.373399630382037, + "grad_norm": 16.249528884887695, + "learning_rate": 2.7110006160299385e-05, + "loss": 2.8067, + "step": 4418000 + }, + { + "epoch": 1.373555062662524, + "grad_norm": 7.716569423675537, + "learning_rate": 2.7107415622291273e-05, + "loss": 2.7861, + "step": 4418500 + }, + { + "epoch": 1.3737104949430108, + "grad_norm": 14.115537643432617, + "learning_rate": 2.7104825084283153e-05, + "loss": 2.7958, + "step": 4419000 + }, + { + "epoch": 1.3738659272234977, + "grad_norm": 6.624422550201416, + "learning_rate": 2.710223454627504e-05, + "loss": 2.7543, + "step": 4419500 + }, + { + "epoch": 1.3740213595039845, + "grad_norm": 10.695996284484863, + "learning_rate": 2.7099644008266927e-05, + "loss": 2.835, + "step": 4420000 + }, + { + "epoch": 1.3741767917844714, + "grad_norm": 8.395355224609375, + "learning_rate": 2.709705347025881e-05, + "loss": 2.7968, + "step": 4420500 + }, + { + "epoch": 1.3743322240649583, + "grad_norm": 10.077319145202637, + "learning_rate": 2.7094462932250698e-05, + "loss": 2.8166, + "step": 4421000 + }, + { + "epoch": 1.3744876563454451, + "grad_norm": 11.416667938232422, + "learning_rate": 2.7091872394242585e-05, + "loss": 2.768, + "step": 4421500 + }, + { + "epoch": 1.374643088625932, + "grad_norm": 11.206177711486816, + "learning_rate": 2.7089281856234466e-05, + "loss": 2.7731, + "step": 4422000 + }, + { + "epoch": 1.3747985209064189, + "grad_norm": 7.3583173751831055, + "learning_rate": 2.7086691318226353e-05, + "loss": 2.7816, + "step": 4422500 + }, + { + "epoch": 1.3749539531869057, + "grad_norm": 9.330676078796387, + "learning_rate": 2.708410078021824e-05, + "loss": 2.7824, + "step": 4423000 + }, + { + "epoch": 1.3751093854673926, + "grad_norm": 8.318085670471191, + "learning_rate": 2.7081510242210124e-05, + "loss": 2.8115, + "step": 4423500 + }, + { + "epoch": 1.3752648177478795, + "grad_norm": 17.62772560119629, + "learning_rate": 2.707891970420201e-05, + "loss": 2.7868, + "step": 4424000 + }, + { + "epoch": 1.3754202500283663, + "grad_norm": 8.991317749023438, + "learning_rate": 2.707632916619389e-05, + "loss": 2.8001, + "step": 4424500 + }, + { + "epoch": 1.3755756823088532, + "grad_norm": 9.337248802185059, + "learning_rate": 2.707373862818578e-05, + "loss": 2.7725, + "step": 4425000 + }, + { + "epoch": 1.3757311145893403, + "grad_norm": 12.700013160705566, + "learning_rate": 2.7071148090177666e-05, + "loss": 2.8269, + "step": 4425500 + }, + { + "epoch": 1.3758865468698271, + "grad_norm": 8.019001007080078, + "learning_rate": 2.706855755216955e-05, + "loss": 2.8035, + "step": 4426000 + }, + { + "epoch": 1.376041979150314, + "grad_norm": 8.96700382232666, + "learning_rate": 2.7065967014161437e-05, + "loss": 2.7832, + "step": 4426500 + }, + { + "epoch": 1.3761974114308009, + "grad_norm": 12.328572273254395, + "learning_rate": 2.7063376476153324e-05, + "loss": 2.7976, + "step": 4427000 + }, + { + "epoch": 1.3763528437112877, + "grad_norm": 8.657023429870605, + "learning_rate": 2.7060785938145204e-05, + "loss": 2.8245, + "step": 4427500 + }, + { + "epoch": 1.3765082759917746, + "grad_norm": 9.43945598602295, + "learning_rate": 2.7058195400137095e-05, + "loss": 2.8223, + "step": 4428000 + }, + { + "epoch": 1.3766637082722615, + "grad_norm": 8.890125274658203, + "learning_rate": 2.7055604862128982e-05, + "loss": 2.7571, + "step": 4428500 + }, + { + "epoch": 1.3768191405527483, + "grad_norm": 20.468181610107422, + "learning_rate": 2.7053014324120862e-05, + "loss": 2.8135, + "step": 4429000 + }, + { + "epoch": 1.3769745728332352, + "grad_norm": 18.478872299194336, + "learning_rate": 2.705042378611275e-05, + "loss": 2.788, + "step": 4429500 + }, + { + "epoch": 1.377130005113722, + "grad_norm": 10.341055870056152, + "learning_rate": 2.7047833248104633e-05, + "loss": 2.8231, + "step": 4430000 + }, + { + "epoch": 1.377285437394209, + "grad_norm": 12.838706970214844, + "learning_rate": 2.704524271009652e-05, + "loss": 2.7968, + "step": 4430500 + }, + { + "epoch": 1.3774408696746958, + "grad_norm": 9.032247543334961, + "learning_rate": 2.7042652172088407e-05, + "loss": 2.8025, + "step": 4431000 + }, + { + "epoch": 1.3775963019551827, + "grad_norm": 9.694925308227539, + "learning_rate": 2.7040061634080288e-05, + "loss": 2.7826, + "step": 4431500 + }, + { + "epoch": 1.3777517342356695, + "grad_norm": 8.759687423706055, + "learning_rate": 2.7037471096072175e-05, + "loss": 2.7845, + "step": 4432000 + }, + { + "epoch": 1.3779071665161564, + "grad_norm": 11.43653392791748, + "learning_rate": 2.7034880558064062e-05, + "loss": 2.7812, + "step": 4432500 + }, + { + "epoch": 1.3780625987966433, + "grad_norm": 11.330557823181152, + "learning_rate": 2.7032290020055946e-05, + "loss": 2.8004, + "step": 4433000 + }, + { + "epoch": 1.3782180310771301, + "grad_norm": 7.766624450683594, + "learning_rate": 2.7029699482047833e-05, + "loss": 2.7587, + "step": 4433500 + }, + { + "epoch": 1.378373463357617, + "grad_norm": 8.003325462341309, + "learning_rate": 2.702710894403972e-05, + "loss": 2.8118, + "step": 4434000 + }, + { + "epoch": 1.3785288956381039, + "grad_norm": 9.65359878540039, + "learning_rate": 2.70245184060316e-05, + "loss": 2.781, + "step": 4434500 + }, + { + "epoch": 1.3786843279185907, + "grad_norm": 9.479591369628906, + "learning_rate": 2.7021927868023488e-05, + "loss": 2.8124, + "step": 4435000 + }, + { + "epoch": 1.3788397601990776, + "grad_norm": 31.65603256225586, + "learning_rate": 2.7019337330015375e-05, + "loss": 2.8239, + "step": 4435500 + }, + { + "epoch": 1.3789951924795645, + "grad_norm": 9.470102310180664, + "learning_rate": 2.701674679200726e-05, + "loss": 2.801, + "step": 4436000 + }, + { + "epoch": 1.3791506247600513, + "grad_norm": 9.442705154418945, + "learning_rate": 2.7014156253999146e-05, + "loss": 2.7918, + "step": 4436500 + }, + { + "epoch": 1.3793060570405382, + "grad_norm": 49.85576248168945, + "learning_rate": 2.7011565715991026e-05, + "loss": 2.7783, + "step": 4437000 + }, + { + "epoch": 1.379461489321025, + "grad_norm": 9.535101890563965, + "learning_rate": 2.7008975177982913e-05, + "loss": 2.8248, + "step": 4437500 + }, + { + "epoch": 1.379616921601512, + "grad_norm": 11.448352813720703, + "learning_rate": 2.7006384639974804e-05, + "loss": 2.8432, + "step": 4438000 + }, + { + "epoch": 1.3797723538819988, + "grad_norm": 8.859139442443848, + "learning_rate": 2.7003794101966684e-05, + "loss": 2.804, + "step": 4438500 + }, + { + "epoch": 1.3799277861624857, + "grad_norm": 35.20502471923828, + "learning_rate": 2.700120356395857e-05, + "loss": 2.7631, + "step": 4439000 + }, + { + "epoch": 1.3800832184429725, + "grad_norm": 21.87862777709961, + "learning_rate": 2.699861302595046e-05, + "loss": 2.8146, + "step": 4439500 + }, + { + "epoch": 1.3802386507234594, + "grad_norm": 9.651006698608398, + "learning_rate": 2.6996022487942342e-05, + "loss": 2.7896, + "step": 4440000 + }, + { + "epoch": 1.3803940830039465, + "grad_norm": 9.291642189025879, + "learning_rate": 2.699343194993423e-05, + "loss": 2.7972, + "step": 4440500 + }, + { + "epoch": 1.3805495152844334, + "grad_norm": 14.246456146240234, + "learning_rate": 2.6990841411926117e-05, + "loss": 2.7884, + "step": 4441000 + }, + { + "epoch": 1.3807049475649202, + "grad_norm": 7.69642972946167, + "learning_rate": 2.6988250873917997e-05, + "loss": 2.7987, + "step": 4441500 + }, + { + "epoch": 1.380860379845407, + "grad_norm": 37.27171325683594, + "learning_rate": 2.6985660335909884e-05, + "loss": 2.8165, + "step": 4442000 + }, + { + "epoch": 1.381015812125894, + "grad_norm": 7.632607460021973, + "learning_rate": 2.6983069797901768e-05, + "loss": 2.8144, + "step": 4442500 + }, + { + "epoch": 1.3811712444063808, + "grad_norm": 8.610279083251953, + "learning_rate": 2.6980479259893655e-05, + "loss": 2.7854, + "step": 4443000 + }, + { + "epoch": 1.3813266766868677, + "grad_norm": 9.4527006149292, + "learning_rate": 2.6977888721885542e-05, + "loss": 2.8595, + "step": 4443500 + }, + { + "epoch": 1.3814821089673546, + "grad_norm": 8.555150985717773, + "learning_rate": 2.6975298183877423e-05, + "loss": 2.7936, + "step": 4444000 + }, + { + "epoch": 1.3816375412478414, + "grad_norm": 7.275887966156006, + "learning_rate": 2.697270764586931e-05, + "loss": 2.7946, + "step": 4444500 + }, + { + "epoch": 1.3817929735283283, + "grad_norm": 11.152787208557129, + "learning_rate": 2.6970117107861197e-05, + "loss": 2.7955, + "step": 4445000 + }, + { + "epoch": 1.3819484058088152, + "grad_norm": 10.723825454711914, + "learning_rate": 2.696752656985308e-05, + "loss": 2.7542, + "step": 4445500 + }, + { + "epoch": 1.382103838089302, + "grad_norm": 18.00555992126465, + "learning_rate": 2.6964936031844968e-05, + "loss": 2.7743, + "step": 4446000 + }, + { + "epoch": 1.382259270369789, + "grad_norm": 8.99444580078125, + "learning_rate": 2.6962345493836855e-05, + "loss": 2.8395, + "step": 4446500 + }, + { + "epoch": 1.3824147026502758, + "grad_norm": 11.182668685913086, + "learning_rate": 2.6959754955828735e-05, + "loss": 2.811, + "step": 4447000 + }, + { + "epoch": 1.3825701349307626, + "grad_norm": 17.054162979125977, + "learning_rate": 2.6957164417820622e-05, + "loss": 2.8099, + "step": 4447500 + }, + { + "epoch": 1.3827255672112495, + "grad_norm": 8.85260009765625, + "learning_rate": 2.6954573879812506e-05, + "loss": 2.7704, + "step": 4448000 + }, + { + "epoch": 1.3828809994917364, + "grad_norm": 9.506546020507812, + "learning_rate": 2.6951983341804393e-05, + "loss": 2.7861, + "step": 4448500 + }, + { + "epoch": 1.3830364317722232, + "grad_norm": 11.737190246582031, + "learning_rate": 2.694939280379628e-05, + "loss": 2.7825, + "step": 4449000 + }, + { + "epoch": 1.3831918640527103, + "grad_norm": 11.325421333312988, + "learning_rate": 2.694680226578816e-05, + "loss": 2.774, + "step": 4449500 + }, + { + "epoch": 1.3833472963331972, + "grad_norm": 9.414069175720215, + "learning_rate": 2.694421172778005e-05, + "loss": 2.7801, + "step": 4450000 + }, + { + "epoch": 1.383502728613684, + "grad_norm": 17.96491050720215, + "learning_rate": 2.694162118977194e-05, + "loss": 2.8109, + "step": 4450500 + }, + { + "epoch": 1.383658160894171, + "grad_norm": 16.52579689025879, + "learning_rate": 2.693903065176382e-05, + "loss": 2.8034, + "step": 4451000 + }, + { + "epoch": 1.3838135931746578, + "grad_norm": 10.509295463562012, + "learning_rate": 2.6936440113755706e-05, + "loss": 2.7959, + "step": 4451500 + }, + { + "epoch": 1.3839690254551447, + "grad_norm": 12.786064147949219, + "learning_rate": 2.6933849575747593e-05, + "loss": 2.7985, + "step": 4452000 + }, + { + "epoch": 1.3841244577356315, + "grad_norm": 9.806163787841797, + "learning_rate": 2.6931259037739477e-05, + "loss": 2.811, + "step": 4452500 + }, + { + "epoch": 1.3842798900161184, + "grad_norm": 7.436089992523193, + "learning_rate": 2.6928668499731364e-05, + "loss": 2.7622, + "step": 4453000 + }, + { + "epoch": 1.3844353222966053, + "grad_norm": 18.658185958862305, + "learning_rate": 2.692607796172325e-05, + "loss": 2.8076, + "step": 4453500 + }, + { + "epoch": 1.3845907545770921, + "grad_norm": 10.857693672180176, + "learning_rate": 2.6923487423715132e-05, + "loss": 2.7997, + "step": 4454000 + }, + { + "epoch": 1.384746186857579, + "grad_norm": 11.409899711608887, + "learning_rate": 2.692089688570702e-05, + "loss": 2.8327, + "step": 4454500 + }, + { + "epoch": 1.3849016191380659, + "grad_norm": 29.28846549987793, + "learning_rate": 2.6918306347698903e-05, + "loss": 2.8068, + "step": 4455000 + }, + { + "epoch": 1.3850570514185527, + "grad_norm": 8.905523300170898, + "learning_rate": 2.691571580969079e-05, + "loss": 2.8462, + "step": 4455500 + }, + { + "epoch": 1.3852124836990396, + "grad_norm": 9.111837387084961, + "learning_rate": 2.6913125271682677e-05, + "loss": 2.8492, + "step": 4456000 + }, + { + "epoch": 1.3853679159795265, + "grad_norm": 8.313008308410645, + "learning_rate": 2.6910534733674557e-05, + "loss": 2.792, + "step": 4456500 + }, + { + "epoch": 1.3855233482600133, + "grad_norm": 9.33522891998291, + "learning_rate": 2.6907944195666444e-05, + "loss": 2.8366, + "step": 4457000 + }, + { + "epoch": 1.3856787805405002, + "grad_norm": 8.239705085754395, + "learning_rate": 2.690535365765833e-05, + "loss": 2.7997, + "step": 4457500 + }, + { + "epoch": 1.385834212820987, + "grad_norm": 10.369956016540527, + "learning_rate": 2.6902763119650215e-05, + "loss": 2.7955, + "step": 4458000 + }, + { + "epoch": 1.385989645101474, + "grad_norm": 9.987090110778809, + "learning_rate": 2.6900172581642103e-05, + "loss": 2.8358, + "step": 4458500 + }, + { + "epoch": 1.3861450773819608, + "grad_norm": 6.99656343460083, + "learning_rate": 2.689758204363399e-05, + "loss": 2.8132, + "step": 4459000 + }, + { + "epoch": 1.3863005096624477, + "grad_norm": 8.254796028137207, + "learning_rate": 2.689499150562587e-05, + "loss": 2.7883, + "step": 4459500 + }, + { + "epoch": 1.3864559419429345, + "grad_norm": 14.263772010803223, + "learning_rate": 2.689240096761776e-05, + "loss": 2.7935, + "step": 4460000 + }, + { + "epoch": 1.3866113742234214, + "grad_norm": 9.633260726928711, + "learning_rate": 2.688981042960964e-05, + "loss": 2.8194, + "step": 4460500 + }, + { + "epoch": 1.3867668065039083, + "grad_norm": 5.551095485687256, + "learning_rate": 2.6887219891601528e-05, + "loss": 2.7921, + "step": 4461000 + }, + { + "epoch": 1.3869222387843951, + "grad_norm": 8.327601432800293, + "learning_rate": 2.6884629353593415e-05, + "loss": 2.8014, + "step": 4461500 + }, + { + "epoch": 1.387077671064882, + "grad_norm": 46.8454475402832, + "learning_rate": 2.68820388155853e-05, + "loss": 2.8559, + "step": 4462000 + }, + { + "epoch": 1.3872331033453689, + "grad_norm": 16.07242774963379, + "learning_rate": 2.6879448277577186e-05, + "loss": 2.8361, + "step": 4462500 + }, + { + "epoch": 1.3873885356258557, + "grad_norm": 15.596317291259766, + "learning_rate": 2.6876857739569073e-05, + "loss": 2.8134, + "step": 4463000 + }, + { + "epoch": 1.3875439679063426, + "grad_norm": 9.330925941467285, + "learning_rate": 2.6874267201560954e-05, + "loss": 2.8308, + "step": 4463500 + }, + { + "epoch": 1.3876994001868295, + "grad_norm": 12.080370903015137, + "learning_rate": 2.687167666355284e-05, + "loss": 2.8227, + "step": 4464000 + }, + { + "epoch": 1.3878548324673166, + "grad_norm": 7.802847862243652, + "learning_rate": 2.6869086125544728e-05, + "loss": 2.798, + "step": 4464500 + }, + { + "epoch": 1.3880102647478034, + "grad_norm": 9.597042083740234, + "learning_rate": 2.6866495587536612e-05, + "loss": 2.7429, + "step": 4465000 + }, + { + "epoch": 1.3881656970282903, + "grad_norm": 8.28187084197998, + "learning_rate": 2.68639050495285e-05, + "loss": 2.8076, + "step": 4465500 + }, + { + "epoch": 1.3883211293087772, + "grad_norm": 14.91976261138916, + "learning_rate": 2.686131451152038e-05, + "loss": 2.7844, + "step": 4466000 + }, + { + "epoch": 1.388476561589264, + "grad_norm": 8.663790702819824, + "learning_rate": 2.6858723973512267e-05, + "loss": 2.7771, + "step": 4466500 + }, + { + "epoch": 1.388631993869751, + "grad_norm": 9.168683052062988, + "learning_rate": 2.6856133435504154e-05, + "loss": 2.7797, + "step": 4467000 + }, + { + "epoch": 1.3887874261502378, + "grad_norm": 12.018232345581055, + "learning_rate": 2.6853542897496037e-05, + "loss": 2.7678, + "step": 4467500 + }, + { + "epoch": 1.3889428584307246, + "grad_norm": 25.3947696685791, + "learning_rate": 2.6850952359487925e-05, + "loss": 2.8631, + "step": 4468000 + }, + { + "epoch": 1.3890982907112115, + "grad_norm": 8.337458610534668, + "learning_rate": 2.6848361821479812e-05, + "loss": 2.7709, + "step": 4468500 + }, + { + "epoch": 1.3892537229916984, + "grad_norm": 14.730055809020996, + "learning_rate": 2.6845771283471692e-05, + "loss": 2.7953, + "step": 4469000 + }, + { + "epoch": 1.3894091552721852, + "grad_norm": 18.498340606689453, + "learning_rate": 2.684318074546358e-05, + "loss": 2.7919, + "step": 4469500 + }, + { + "epoch": 1.389564587552672, + "grad_norm": 9.302492141723633, + "learning_rate": 2.684059020745547e-05, + "loss": 2.8157, + "step": 4470000 + }, + { + "epoch": 1.389720019833159, + "grad_norm": 7.444825649261475, + "learning_rate": 2.683799966944735e-05, + "loss": 2.8476, + "step": 4470500 + }, + { + "epoch": 1.3898754521136458, + "grad_norm": 9.113393783569336, + "learning_rate": 2.6835409131439237e-05, + "loss": 2.7375, + "step": 4471000 + }, + { + "epoch": 1.3900308843941327, + "grad_norm": 7.607105731964111, + "learning_rate": 2.6832818593431125e-05, + "loss": 2.8091, + "step": 4471500 + }, + { + "epoch": 1.3901863166746196, + "grad_norm": 8.55292797088623, + "learning_rate": 2.6830228055423008e-05, + "loss": 2.7884, + "step": 4472000 + }, + { + "epoch": 1.3903417489551064, + "grad_norm": 8.055218696594238, + "learning_rate": 2.6827637517414895e-05, + "loss": 2.792, + "step": 4472500 + }, + { + "epoch": 1.3904971812355933, + "grad_norm": 9.353911399841309, + "learning_rate": 2.6825046979406776e-05, + "loss": 2.7927, + "step": 4473000 + }, + { + "epoch": 1.3906526135160804, + "grad_norm": 6.5291924476623535, + "learning_rate": 2.6822456441398663e-05, + "loss": 2.7733, + "step": 4473500 + }, + { + "epoch": 1.3908080457965672, + "grad_norm": 11.549114227294922, + "learning_rate": 2.681986590339055e-05, + "loss": 2.7963, + "step": 4474000 + }, + { + "epoch": 1.3909634780770541, + "grad_norm": 8.345335960388184, + "learning_rate": 2.6817275365382434e-05, + "loss": 2.7874, + "step": 4474500 + }, + { + "epoch": 1.391118910357541, + "grad_norm": 10.336125373840332, + "learning_rate": 2.681468482737432e-05, + "loss": 2.7931, + "step": 4475000 + }, + { + "epoch": 1.3912743426380278, + "grad_norm": 9.853302001953125, + "learning_rate": 2.6812094289366208e-05, + "loss": 2.8236, + "step": 4475500 + }, + { + "epoch": 1.3914297749185147, + "grad_norm": 16.876012802124023, + "learning_rate": 2.680950375135809e-05, + "loss": 2.8147, + "step": 4476000 + }, + { + "epoch": 1.3915852071990016, + "grad_norm": 9.716439247131348, + "learning_rate": 2.6806913213349976e-05, + "loss": 2.8043, + "step": 4476500 + }, + { + "epoch": 1.3917406394794885, + "grad_norm": 11.512262344360352, + "learning_rate": 2.6804322675341863e-05, + "loss": 2.7449, + "step": 4477000 + }, + { + "epoch": 1.3918960717599753, + "grad_norm": 8.961003303527832, + "learning_rate": 2.6801732137333747e-05, + "loss": 2.7979, + "step": 4477500 + }, + { + "epoch": 1.3920515040404622, + "grad_norm": 8.156964302062988, + "learning_rate": 2.6799141599325634e-05, + "loss": 2.7858, + "step": 4478000 + }, + { + "epoch": 1.392206936320949, + "grad_norm": 8.317987442016602, + "learning_rate": 2.6796551061317514e-05, + "loss": 2.764, + "step": 4478500 + }, + { + "epoch": 1.392362368601436, + "grad_norm": 14.179770469665527, + "learning_rate": 2.67939605233094e-05, + "loss": 2.7521, + "step": 4479000 + }, + { + "epoch": 1.3925178008819228, + "grad_norm": 17.223169326782227, + "learning_rate": 2.679136998530129e-05, + "loss": 2.8196, + "step": 4479500 + }, + { + "epoch": 1.3926732331624097, + "grad_norm": 10.9318208694458, + "learning_rate": 2.6788779447293172e-05, + "loss": 2.7865, + "step": 4480000 + }, + { + "epoch": 1.3928286654428965, + "grad_norm": 9.877053260803223, + "learning_rate": 2.678618890928506e-05, + "loss": 2.8221, + "step": 4480500 + }, + { + "epoch": 1.3929840977233834, + "grad_norm": 10.339855194091797, + "learning_rate": 2.6783598371276947e-05, + "loss": 2.8017, + "step": 4481000 + }, + { + "epoch": 1.3931395300038703, + "grad_norm": 34.8723258972168, + "learning_rate": 2.6781007833268827e-05, + "loss": 2.7984, + "step": 4481500 + }, + { + "epoch": 1.3932949622843571, + "grad_norm": 10.56783676147461, + "learning_rate": 2.6778417295260717e-05, + "loss": 2.7979, + "step": 4482000 + }, + { + "epoch": 1.393450394564844, + "grad_norm": 9.662239074707031, + "learning_rate": 2.6775826757252605e-05, + "loss": 2.7781, + "step": 4482500 + }, + { + "epoch": 1.3936058268453309, + "grad_norm": 11.359755516052246, + "learning_rate": 2.6773236219244485e-05, + "loss": 2.8064, + "step": 4483000 + }, + { + "epoch": 1.3937612591258177, + "grad_norm": 8.944417953491211, + "learning_rate": 2.6770645681236372e-05, + "loss": 2.7975, + "step": 4483500 + }, + { + "epoch": 1.3939166914063046, + "grad_norm": 7.504831314086914, + "learning_rate": 2.6768055143228256e-05, + "loss": 2.8255, + "step": 4484000 + }, + { + "epoch": 1.3940721236867915, + "grad_norm": 8.499293327331543, + "learning_rate": 2.6765464605220143e-05, + "loss": 2.8406, + "step": 4484500 + }, + { + "epoch": 1.3942275559672783, + "grad_norm": 6.096229553222656, + "learning_rate": 2.676287406721203e-05, + "loss": 2.8122, + "step": 4485000 + }, + { + "epoch": 1.3943829882477652, + "grad_norm": 9.041154861450195, + "learning_rate": 2.676028352920391e-05, + "loss": 2.8498, + "step": 4485500 + }, + { + "epoch": 1.394538420528252, + "grad_norm": 12.061854362487793, + "learning_rate": 2.6757692991195798e-05, + "loss": 2.808, + "step": 4486000 + }, + { + "epoch": 1.394693852808739, + "grad_norm": 8.49205207824707, + "learning_rate": 2.6755102453187685e-05, + "loss": 2.8054, + "step": 4486500 + }, + { + "epoch": 1.3948492850892258, + "grad_norm": 8.921133995056152, + "learning_rate": 2.675251191517957e-05, + "loss": 2.7574, + "step": 4487000 + }, + { + "epoch": 1.3950047173697127, + "grad_norm": 7.494865894317627, + "learning_rate": 2.6749921377171456e-05, + "loss": 2.8395, + "step": 4487500 + }, + { + "epoch": 1.3951601496501995, + "grad_norm": 10.792182922363281, + "learning_rate": 2.6747330839163343e-05, + "loss": 2.8369, + "step": 4488000 + }, + { + "epoch": 1.3953155819306866, + "grad_norm": 8.512145042419434, + "learning_rate": 2.6744740301155223e-05, + "loss": 2.7779, + "step": 4488500 + }, + { + "epoch": 1.3954710142111735, + "grad_norm": 10.660477638244629, + "learning_rate": 2.674214976314711e-05, + "loss": 2.7436, + "step": 4489000 + }, + { + "epoch": 1.3956264464916603, + "grad_norm": 11.598603248596191, + "learning_rate": 2.6739559225138998e-05, + "loss": 2.7821, + "step": 4489500 + }, + { + "epoch": 1.3957818787721472, + "grad_norm": 10.377592086791992, + "learning_rate": 2.673696868713088e-05, + "loss": 2.8023, + "step": 4490000 + }, + { + "epoch": 1.395937311052634, + "grad_norm": 9.454526901245117, + "learning_rate": 2.673437814912277e-05, + "loss": 2.7596, + "step": 4490500 + }, + { + "epoch": 1.396092743333121, + "grad_norm": 8.991732597351074, + "learning_rate": 2.673178761111465e-05, + "loss": 2.8103, + "step": 4491000 + }, + { + "epoch": 1.3962481756136078, + "grad_norm": 13.978073120117188, + "learning_rate": 2.6729197073106536e-05, + "loss": 2.8259, + "step": 4491500 + }, + { + "epoch": 1.3964036078940947, + "grad_norm": 7.397646903991699, + "learning_rate": 2.6726606535098427e-05, + "loss": 2.8127, + "step": 4492000 + }, + { + "epoch": 1.3965590401745815, + "grad_norm": 9.582259178161621, + "learning_rate": 2.6724015997090307e-05, + "loss": 2.8174, + "step": 4492500 + }, + { + "epoch": 1.3967144724550684, + "grad_norm": 7.536062717437744, + "learning_rate": 2.6721425459082194e-05, + "loss": 2.8035, + "step": 4493000 + }, + { + "epoch": 1.3968699047355553, + "grad_norm": 5.894103527069092, + "learning_rate": 2.671883492107408e-05, + "loss": 2.7899, + "step": 4493500 + }, + { + "epoch": 1.3970253370160421, + "grad_norm": 10.031575202941895, + "learning_rate": 2.6716244383065965e-05, + "loss": 2.8481, + "step": 4494000 + }, + { + "epoch": 1.397180769296529, + "grad_norm": 10.842978477478027, + "learning_rate": 2.6713653845057852e-05, + "loss": 2.8024, + "step": 4494500 + }, + { + "epoch": 1.3973362015770159, + "grad_norm": 8.39716911315918, + "learning_rate": 2.671106330704974e-05, + "loss": 2.8399, + "step": 4495000 + }, + { + "epoch": 1.3974916338575027, + "grad_norm": 9.300857543945312, + "learning_rate": 2.670847276904162e-05, + "loss": 2.8017, + "step": 4495500 + }, + { + "epoch": 1.3976470661379896, + "grad_norm": 9.100784301757812, + "learning_rate": 2.6705882231033507e-05, + "loss": 2.7777, + "step": 4496000 + }, + { + "epoch": 1.3978024984184765, + "grad_norm": 13.364859580993652, + "learning_rate": 2.670329169302539e-05, + "loss": 2.7745, + "step": 4496500 + }, + { + "epoch": 1.3979579306989633, + "grad_norm": 9.59040641784668, + "learning_rate": 2.6700701155017278e-05, + "loss": 2.8441, + "step": 4497000 + }, + { + "epoch": 1.3981133629794504, + "grad_norm": 8.588595390319824, + "learning_rate": 2.6698110617009165e-05, + "loss": 2.7605, + "step": 4497500 + }, + { + "epoch": 1.3982687952599373, + "grad_norm": 8.521146774291992, + "learning_rate": 2.6695520079001045e-05, + "loss": 2.7881, + "step": 4498000 + }, + { + "epoch": 1.3984242275404242, + "grad_norm": 8.800176620483398, + "learning_rate": 2.6692929540992933e-05, + "loss": 2.7982, + "step": 4498500 + }, + { + "epoch": 1.398579659820911, + "grad_norm": 14.647132873535156, + "learning_rate": 2.669033900298482e-05, + "loss": 2.7972, + "step": 4499000 + }, + { + "epoch": 1.398735092101398, + "grad_norm": 7.989153861999512, + "learning_rate": 2.6687748464976703e-05, + "loss": 2.8156, + "step": 4499500 + }, + { + "epoch": 1.3988905243818848, + "grad_norm": 9.370591163635254, + "learning_rate": 2.668515792696859e-05, + "loss": 2.7974, + "step": 4500000 + }, + { + "epoch": 1.3990459566623716, + "grad_norm": 8.977374076843262, + "learning_rate": 2.6682567388960478e-05, + "loss": 2.8249, + "step": 4500500 + }, + { + "epoch": 1.3992013889428585, + "grad_norm": 9.180310249328613, + "learning_rate": 2.6679976850952358e-05, + "loss": 2.7825, + "step": 4501000 + }, + { + "epoch": 1.3993568212233454, + "grad_norm": 10.068610191345215, + "learning_rate": 2.6677386312944245e-05, + "loss": 2.779, + "step": 4501500 + }, + { + "epoch": 1.3995122535038322, + "grad_norm": 14.034005165100098, + "learning_rate": 2.667479577493613e-05, + "loss": 2.7925, + "step": 4502000 + }, + { + "epoch": 1.399667685784319, + "grad_norm": 8.677120208740234, + "learning_rate": 2.6672205236928016e-05, + "loss": 2.7867, + "step": 4502500 + }, + { + "epoch": 1.399823118064806, + "grad_norm": 7.633734226226807, + "learning_rate": 2.6669614698919903e-05, + "loss": 2.7654, + "step": 4503000 + }, + { + "epoch": 1.3999785503452928, + "grad_norm": 8.98963451385498, + "learning_rate": 2.6667024160911787e-05, + "loss": 2.7737, + "step": 4503500 + }, + { + "epoch": 1.4001339826257797, + "grad_norm": 9.938183784484863, + "learning_rate": 2.6664433622903674e-05, + "loss": 2.813, + "step": 4504000 + }, + { + "epoch": 1.4002894149062666, + "grad_norm": 12.204302787780762, + "learning_rate": 2.666184308489556e-05, + "loss": 2.8081, + "step": 4504500 + }, + { + "epoch": 1.4004448471867534, + "grad_norm": 10.592387199401855, + "learning_rate": 2.6659252546887442e-05, + "loss": 2.7711, + "step": 4505000 + }, + { + "epoch": 1.4006002794672403, + "grad_norm": 10.989477157592773, + "learning_rate": 2.665666200887933e-05, + "loss": 2.7753, + "step": 4505500 + }, + { + "epoch": 1.4007557117477272, + "grad_norm": 11.247836112976074, + "learning_rate": 2.6654071470871216e-05, + "loss": 2.8234, + "step": 4506000 + }, + { + "epoch": 1.400911144028214, + "grad_norm": 23.60127067565918, + "learning_rate": 2.66514809328631e-05, + "loss": 2.813, + "step": 4506500 + }, + { + "epoch": 1.401066576308701, + "grad_norm": 11.827484130859375, + "learning_rate": 2.6648890394854987e-05, + "loss": 2.7917, + "step": 4507000 + }, + { + "epoch": 1.4012220085891878, + "grad_norm": 43.07888412475586, + "learning_rate": 2.6646299856846874e-05, + "loss": 2.7933, + "step": 4507500 + }, + { + "epoch": 1.4013774408696746, + "grad_norm": 7.6812567710876465, + "learning_rate": 2.6643709318838755e-05, + "loss": 2.8249, + "step": 4508000 + }, + { + "epoch": 1.4015328731501615, + "grad_norm": 8.215930938720703, + "learning_rate": 2.6641118780830642e-05, + "loss": 2.7795, + "step": 4508500 + }, + { + "epoch": 1.4016883054306484, + "grad_norm": 9.181672096252441, + "learning_rate": 2.6638528242822525e-05, + "loss": 2.8156, + "step": 4509000 + }, + { + "epoch": 1.4018437377111352, + "grad_norm": 9.708134651184082, + "learning_rate": 2.6635937704814413e-05, + "loss": 2.784, + "step": 4509500 + }, + { + "epoch": 1.401999169991622, + "grad_norm": 17.988807678222656, + "learning_rate": 2.66333471668063e-05, + "loss": 2.8097, + "step": 4510000 + }, + { + "epoch": 1.402154602272109, + "grad_norm": 8.121528625488281, + "learning_rate": 2.663075662879818e-05, + "loss": 2.7748, + "step": 4510500 + }, + { + "epoch": 1.4023100345525958, + "grad_norm": 7.399413585662842, + "learning_rate": 2.6628166090790067e-05, + "loss": 2.8106, + "step": 4511000 + }, + { + "epoch": 1.4024654668330827, + "grad_norm": 9.65169620513916, + "learning_rate": 2.6625575552781954e-05, + "loss": 2.7868, + "step": 4511500 + }, + { + "epoch": 1.4026208991135696, + "grad_norm": 8.556903839111328, + "learning_rate": 2.6622985014773838e-05, + "loss": 2.8067, + "step": 4512000 + }, + { + "epoch": 1.4027763313940567, + "grad_norm": 9.912285804748535, + "learning_rate": 2.6620394476765725e-05, + "loss": 2.8137, + "step": 4512500 + }, + { + "epoch": 1.4029317636745435, + "grad_norm": 12.991293907165527, + "learning_rate": 2.6617803938757613e-05, + "loss": 2.7678, + "step": 4513000 + }, + { + "epoch": 1.4030871959550304, + "grad_norm": 10.013129234313965, + "learning_rate": 2.6615213400749496e-05, + "loss": 2.8286, + "step": 4513500 + }, + { + "epoch": 1.4032426282355173, + "grad_norm": 8.803290367126465, + "learning_rate": 2.6612622862741383e-05, + "loss": 2.8193, + "step": 4514000 + }, + { + "epoch": 1.4033980605160041, + "grad_norm": 13.939518928527832, + "learning_rate": 2.6610032324733264e-05, + "loss": 2.7872, + "step": 4514500 + }, + { + "epoch": 1.403553492796491, + "grad_norm": 14.887900352478027, + "learning_rate": 2.660744178672515e-05, + "loss": 2.8242, + "step": 4515000 + }, + { + "epoch": 1.4037089250769779, + "grad_norm": 9.87843132019043, + "learning_rate": 2.6604851248717038e-05, + "loss": 2.7552, + "step": 4515500 + }, + { + "epoch": 1.4038643573574647, + "grad_norm": 39.38836669921875, + "learning_rate": 2.6602260710708922e-05, + "loss": 2.7587, + "step": 4516000 + }, + { + "epoch": 1.4040197896379516, + "grad_norm": 7.117115020751953, + "learning_rate": 2.659967017270081e-05, + "loss": 2.8022, + "step": 4516500 + }, + { + "epoch": 1.4041752219184385, + "grad_norm": 9.941133499145508, + "learning_rate": 2.6597079634692696e-05, + "loss": 2.8183, + "step": 4517000 + }, + { + "epoch": 1.4043306541989253, + "grad_norm": 8.441349029541016, + "learning_rate": 2.6594489096684577e-05, + "loss": 2.786, + "step": 4517500 + }, + { + "epoch": 1.4044860864794122, + "grad_norm": 13.126039505004883, + "learning_rate": 2.6591898558676464e-05, + "loss": 2.8444, + "step": 4518000 + }, + { + "epoch": 1.404641518759899, + "grad_norm": 13.517306327819824, + "learning_rate": 2.658930802066835e-05, + "loss": 2.8264, + "step": 4518500 + }, + { + "epoch": 1.404796951040386, + "grad_norm": 8.523157119750977, + "learning_rate": 2.6586717482660235e-05, + "loss": 2.8052, + "step": 4519000 + }, + { + "epoch": 1.4049523833208728, + "grad_norm": 17.678852081298828, + "learning_rate": 2.6584126944652122e-05, + "loss": 2.8401, + "step": 4519500 + }, + { + "epoch": 1.4051078156013597, + "grad_norm": 8.998528480529785, + "learning_rate": 2.658153640664401e-05, + "loss": 2.8015, + "step": 4520000 + }, + { + "epoch": 1.4052632478818465, + "grad_norm": 10.002030372619629, + "learning_rate": 2.657894586863589e-05, + "loss": 2.8044, + "step": 4520500 + }, + { + "epoch": 1.4054186801623334, + "grad_norm": 9.366726875305176, + "learning_rate": 2.6576355330627777e-05, + "loss": 2.7883, + "step": 4521000 + }, + { + "epoch": 1.4055741124428203, + "grad_norm": 10.653595924377441, + "learning_rate": 2.657376479261966e-05, + "loss": 2.8137, + "step": 4521500 + }, + { + "epoch": 1.4057295447233074, + "grad_norm": 22.933589935302734, + "learning_rate": 2.6571174254611547e-05, + "loss": 2.7926, + "step": 4522000 + }, + { + "epoch": 1.4058849770037942, + "grad_norm": 8.215731620788574, + "learning_rate": 2.6568583716603435e-05, + "loss": 2.7924, + "step": 4522500 + }, + { + "epoch": 1.406040409284281, + "grad_norm": 8.57745361328125, + "learning_rate": 2.6565993178595315e-05, + "loss": 2.8126, + "step": 4523000 + }, + { + "epoch": 1.406195841564768, + "grad_norm": 15.961167335510254, + "learning_rate": 2.6563402640587205e-05, + "loss": 2.7969, + "step": 4523500 + }, + { + "epoch": 1.4063512738452548, + "grad_norm": 12.447580337524414, + "learning_rate": 2.6560812102579093e-05, + "loss": 2.8304, + "step": 4524000 + }, + { + "epoch": 1.4065067061257417, + "grad_norm": 34.519683837890625, + "learning_rate": 2.6558221564570973e-05, + "loss": 2.7604, + "step": 4524500 + }, + { + "epoch": 1.4066621384062286, + "grad_norm": 10.597917556762695, + "learning_rate": 2.655563102656286e-05, + "loss": 2.7804, + "step": 4525000 + }, + { + "epoch": 1.4068175706867154, + "grad_norm": 7.635678291320801, + "learning_rate": 2.6553040488554747e-05, + "loss": 2.8766, + "step": 4525500 + }, + { + "epoch": 1.4069730029672023, + "grad_norm": 8.774572372436523, + "learning_rate": 2.655044995054663e-05, + "loss": 2.7994, + "step": 4526000 + }, + { + "epoch": 1.4071284352476892, + "grad_norm": 7.273867607116699, + "learning_rate": 2.6547859412538518e-05, + "loss": 2.7799, + "step": 4526500 + }, + { + "epoch": 1.407283867528176, + "grad_norm": 8.923027992248535, + "learning_rate": 2.65452688745304e-05, + "loss": 2.8282, + "step": 4527000 + }, + { + "epoch": 1.407439299808663, + "grad_norm": 8.954500198364258, + "learning_rate": 2.6542678336522286e-05, + "loss": 2.7968, + "step": 4527500 + }, + { + "epoch": 1.4075947320891498, + "grad_norm": 8.301506042480469, + "learning_rate": 2.6540087798514173e-05, + "loss": 2.7779, + "step": 4528000 + }, + { + "epoch": 1.4077501643696366, + "grad_norm": 8.595893859863281, + "learning_rate": 2.6537497260506057e-05, + "loss": 2.8287, + "step": 4528500 + }, + { + "epoch": 1.4079055966501235, + "grad_norm": 7.100897789001465, + "learning_rate": 2.6534906722497944e-05, + "loss": 2.8116, + "step": 4529000 + }, + { + "epoch": 1.4080610289306104, + "grad_norm": 9.865517616271973, + "learning_rate": 2.653231618448983e-05, + "loss": 2.8012, + "step": 4529500 + }, + { + "epoch": 1.4082164612110972, + "grad_norm": 10.96525764465332, + "learning_rate": 2.652972564648171e-05, + "loss": 2.8112, + "step": 4530000 + }, + { + "epoch": 1.408371893491584, + "grad_norm": 9.991701126098633, + "learning_rate": 2.65271351084736e-05, + "loss": 2.8404, + "step": 4530500 + }, + { + "epoch": 1.408527325772071, + "grad_norm": 7.8167290687561035, + "learning_rate": 2.6524544570465486e-05, + "loss": 2.8014, + "step": 4531000 + }, + { + "epoch": 1.4086827580525578, + "grad_norm": 8.189706802368164, + "learning_rate": 2.652195403245737e-05, + "loss": 2.8236, + "step": 4531500 + }, + { + "epoch": 1.4088381903330447, + "grad_norm": 27.599267959594727, + "learning_rate": 2.6519363494449257e-05, + "loss": 2.7465, + "step": 4532000 + }, + { + "epoch": 1.4089936226135316, + "grad_norm": 13.566033363342285, + "learning_rate": 2.6516772956441137e-05, + "loss": 2.7825, + "step": 4532500 + }, + { + "epoch": 1.4091490548940184, + "grad_norm": 12.388022422790527, + "learning_rate": 2.6514182418433024e-05, + "loss": 2.7877, + "step": 4533000 + }, + { + "epoch": 1.4093044871745053, + "grad_norm": 9.831328392028809, + "learning_rate": 2.6511591880424915e-05, + "loss": 2.7955, + "step": 4533500 + }, + { + "epoch": 1.4094599194549922, + "grad_norm": 8.782076835632324, + "learning_rate": 2.6509001342416795e-05, + "loss": 2.7849, + "step": 4534000 + }, + { + "epoch": 1.409615351735479, + "grad_norm": 13.161491394042969, + "learning_rate": 2.6506410804408682e-05, + "loss": 2.8144, + "step": 4534500 + }, + { + "epoch": 1.409770784015966, + "grad_norm": 10.530889511108398, + "learning_rate": 2.650382026640057e-05, + "loss": 2.7999, + "step": 4535000 + }, + { + "epoch": 1.4099262162964528, + "grad_norm": 23.218828201293945, + "learning_rate": 2.6501229728392453e-05, + "loss": 2.8397, + "step": 4535500 + }, + { + "epoch": 1.4100816485769396, + "grad_norm": 20.06774139404297, + "learning_rate": 2.649863919038434e-05, + "loss": 2.7824, + "step": 4536000 + }, + { + "epoch": 1.4102370808574265, + "grad_norm": 7.458800315856934, + "learning_rate": 2.6496048652376227e-05, + "loss": 2.814, + "step": 4536500 + }, + { + "epoch": 1.4103925131379136, + "grad_norm": 11.936882019042969, + "learning_rate": 2.6493458114368108e-05, + "loss": 2.8334, + "step": 4537000 + }, + { + "epoch": 1.4105479454184005, + "grad_norm": 8.168903350830078, + "learning_rate": 2.6490867576359995e-05, + "loss": 2.7758, + "step": 4537500 + }, + { + "epoch": 1.4107033776988873, + "grad_norm": 11.191617965698242, + "learning_rate": 2.6488277038351882e-05, + "loss": 2.8116, + "step": 4538000 + }, + { + "epoch": 1.4108588099793742, + "grad_norm": 10.033514022827148, + "learning_rate": 2.6485686500343766e-05, + "loss": 2.7363, + "step": 4538500 + }, + { + "epoch": 1.411014242259861, + "grad_norm": 8.9967041015625, + "learning_rate": 2.6483095962335653e-05, + "loss": 2.8035, + "step": 4539000 + }, + { + "epoch": 1.411169674540348, + "grad_norm": 10.642633438110352, + "learning_rate": 2.6480505424327533e-05, + "loss": 2.815, + "step": 4539500 + }, + { + "epoch": 1.4113251068208348, + "grad_norm": 9.549165725708008, + "learning_rate": 2.647791488631942e-05, + "loss": 2.8017, + "step": 4540000 + }, + { + "epoch": 1.4114805391013217, + "grad_norm": 10.006218910217285, + "learning_rate": 2.6475324348311308e-05, + "loss": 2.7755, + "step": 4540500 + }, + { + "epoch": 1.4116359713818085, + "grad_norm": 10.148252487182617, + "learning_rate": 2.647273381030319e-05, + "loss": 2.8208, + "step": 4541000 + }, + { + "epoch": 1.4117914036622954, + "grad_norm": 9.266820907592773, + "learning_rate": 2.647014327229508e-05, + "loss": 2.7912, + "step": 4541500 + }, + { + "epoch": 1.4119468359427823, + "grad_norm": 10.361095428466797, + "learning_rate": 2.6467552734286966e-05, + "loss": 2.7712, + "step": 4542000 + }, + { + "epoch": 1.4121022682232691, + "grad_norm": 9.000025749206543, + "learning_rate": 2.6464962196278846e-05, + "loss": 2.7736, + "step": 4542500 + }, + { + "epoch": 1.412257700503756, + "grad_norm": 8.66855239868164, + "learning_rate": 2.6462371658270733e-05, + "loss": 2.7538, + "step": 4543000 + }, + { + "epoch": 1.4124131327842429, + "grad_norm": 13.45615291595459, + "learning_rate": 2.6459781120262624e-05, + "loss": 2.7991, + "step": 4543500 + }, + { + "epoch": 1.4125685650647297, + "grad_norm": 10.31663990020752, + "learning_rate": 2.6457190582254504e-05, + "loss": 2.7923, + "step": 4544000 + }, + { + "epoch": 1.4127239973452166, + "grad_norm": 10.106629371643066, + "learning_rate": 2.645460004424639e-05, + "loss": 2.8121, + "step": 4544500 + }, + { + "epoch": 1.4128794296257035, + "grad_norm": 24.63553237915039, + "learning_rate": 2.6452009506238272e-05, + "loss": 2.7961, + "step": 4545000 + }, + { + "epoch": 1.4130348619061903, + "grad_norm": 12.799239158630371, + "learning_rate": 2.6449418968230162e-05, + "loss": 2.7637, + "step": 4545500 + }, + { + "epoch": 1.4131902941866774, + "grad_norm": 32.72403335571289, + "learning_rate": 2.644682843022205e-05, + "loss": 2.7908, + "step": 4546000 + }, + { + "epoch": 1.4133457264671643, + "grad_norm": 9.870182991027832, + "learning_rate": 2.644423789221393e-05, + "loss": 2.7856, + "step": 4546500 + }, + { + "epoch": 1.4135011587476511, + "grad_norm": 11.019413948059082, + "learning_rate": 2.6441647354205817e-05, + "loss": 2.8454, + "step": 4547000 + }, + { + "epoch": 1.413656591028138, + "grad_norm": 10.846394538879395, + "learning_rate": 2.6439056816197704e-05, + "loss": 2.7787, + "step": 4547500 + }, + { + "epoch": 1.4138120233086249, + "grad_norm": 9.289775848388672, + "learning_rate": 2.6436466278189588e-05, + "loss": 2.7984, + "step": 4548000 + }, + { + "epoch": 1.4139674555891117, + "grad_norm": 9.652724266052246, + "learning_rate": 2.6433875740181475e-05, + "loss": 2.7869, + "step": 4548500 + }, + { + "epoch": 1.4141228878695986, + "grad_norm": 9.274201393127441, + "learning_rate": 2.6431285202173362e-05, + "loss": 2.8322, + "step": 4549000 + }, + { + "epoch": 1.4142783201500855, + "grad_norm": 10.677505493164062, + "learning_rate": 2.6428694664165243e-05, + "loss": 2.8128, + "step": 4549500 + }, + { + "epoch": 1.4144337524305723, + "grad_norm": 8.85865592956543, + "learning_rate": 2.642610412615713e-05, + "loss": 2.8257, + "step": 4550000 + }, + { + "epoch": 1.4145891847110592, + "grad_norm": 9.291104316711426, + "learning_rate": 2.6423513588149014e-05, + "loss": 2.8046, + "step": 4550500 + }, + { + "epoch": 1.414744616991546, + "grad_norm": 11.7376070022583, + "learning_rate": 2.64209230501409e-05, + "loss": 2.7621, + "step": 4551000 + }, + { + "epoch": 1.414900049272033, + "grad_norm": 9.94991683959961, + "learning_rate": 2.6418332512132788e-05, + "loss": 2.814, + "step": 4551500 + }, + { + "epoch": 1.4150554815525198, + "grad_norm": 9.891258239746094, + "learning_rate": 2.6415741974124668e-05, + "loss": 2.7636, + "step": 4552000 + }, + { + "epoch": 1.4152109138330067, + "grad_norm": 11.78282356262207, + "learning_rate": 2.6413151436116555e-05, + "loss": 2.8255, + "step": 4552500 + }, + { + "epoch": 1.4153663461134935, + "grad_norm": 9.065271377563477, + "learning_rate": 2.6410560898108443e-05, + "loss": 2.7761, + "step": 4553000 + }, + { + "epoch": 1.4155217783939804, + "grad_norm": 11.761415481567383, + "learning_rate": 2.6407970360100326e-05, + "loss": 2.7936, + "step": 4553500 + }, + { + "epoch": 1.4156772106744673, + "grad_norm": 9.986536026000977, + "learning_rate": 2.6405379822092213e-05, + "loss": 2.7908, + "step": 4554000 + }, + { + "epoch": 1.4158326429549541, + "grad_norm": 8.818201065063477, + "learning_rate": 2.64027892840841e-05, + "loss": 2.7648, + "step": 4554500 + }, + { + "epoch": 1.415988075235441, + "grad_norm": 9.77673053741455, + "learning_rate": 2.640019874607598e-05, + "loss": 2.7779, + "step": 4555000 + }, + { + "epoch": 1.4161435075159279, + "grad_norm": 10.44826602935791, + "learning_rate": 2.639760820806787e-05, + "loss": 2.7564, + "step": 4555500 + }, + { + "epoch": 1.4162989397964147, + "grad_norm": 7.7499237060546875, + "learning_rate": 2.639501767005976e-05, + "loss": 2.8309, + "step": 4556000 + }, + { + "epoch": 1.4164543720769016, + "grad_norm": 8.563206672668457, + "learning_rate": 2.639242713205164e-05, + "loss": 2.7903, + "step": 4556500 + }, + { + "epoch": 1.4166098043573885, + "grad_norm": 9.604844093322754, + "learning_rate": 2.6389836594043526e-05, + "loss": 2.8329, + "step": 4557000 + }, + { + "epoch": 1.4167652366378753, + "grad_norm": 9.325109481811523, + "learning_rate": 2.638724605603541e-05, + "loss": 2.8587, + "step": 4557500 + }, + { + "epoch": 1.4169206689183622, + "grad_norm": 11.141215324401855, + "learning_rate": 2.6384655518027297e-05, + "loss": 2.7926, + "step": 4558000 + }, + { + "epoch": 1.417076101198849, + "grad_norm": 9.11960220336914, + "learning_rate": 2.6382064980019184e-05, + "loss": 2.7567, + "step": 4558500 + }, + { + "epoch": 1.417231533479336, + "grad_norm": 8.349119186401367, + "learning_rate": 2.6379474442011065e-05, + "loss": 2.8184, + "step": 4559000 + }, + { + "epoch": 1.4173869657598228, + "grad_norm": 8.306509017944336, + "learning_rate": 2.6376883904002952e-05, + "loss": 2.8081, + "step": 4559500 + }, + { + "epoch": 1.4175423980403097, + "grad_norm": 10.852267265319824, + "learning_rate": 2.637429336599484e-05, + "loss": 2.8135, + "step": 4560000 + }, + { + "epoch": 1.4176978303207965, + "grad_norm": 39.56395721435547, + "learning_rate": 2.6371702827986723e-05, + "loss": 2.7701, + "step": 4560500 + }, + { + "epoch": 1.4178532626012836, + "grad_norm": 8.766744613647461, + "learning_rate": 2.636911228997861e-05, + "loss": 2.7872, + "step": 4561000 + }, + { + "epoch": 1.4180086948817705, + "grad_norm": 10.779565811157227, + "learning_rate": 2.6366521751970497e-05, + "loss": 2.78, + "step": 4561500 + }, + { + "epoch": 1.4181641271622574, + "grad_norm": 9.32694149017334, + "learning_rate": 2.6363931213962377e-05, + "loss": 2.7956, + "step": 4562000 + }, + { + "epoch": 1.4183195594427442, + "grad_norm": 15.57890510559082, + "learning_rate": 2.6361340675954265e-05, + "loss": 2.8353, + "step": 4562500 + }, + { + "epoch": 1.418474991723231, + "grad_norm": 9.488483428955078, + "learning_rate": 2.635875013794615e-05, + "loss": 2.8458, + "step": 4563000 + }, + { + "epoch": 1.418630424003718, + "grad_norm": 8.659285545349121, + "learning_rate": 2.6356159599938035e-05, + "loss": 2.8032, + "step": 4563500 + }, + { + "epoch": 1.4187858562842048, + "grad_norm": 11.124778747558594, + "learning_rate": 2.6353569061929923e-05, + "loss": 2.7667, + "step": 4564000 + }, + { + "epoch": 1.4189412885646917, + "grad_norm": 8.460487365722656, + "learning_rate": 2.6350978523921803e-05, + "loss": 2.7939, + "step": 4564500 + }, + { + "epoch": 1.4190967208451786, + "grad_norm": 80.40219116210938, + "learning_rate": 2.634838798591369e-05, + "loss": 2.8004, + "step": 4565000 + }, + { + "epoch": 1.4192521531256654, + "grad_norm": 11.68047046661377, + "learning_rate": 2.634579744790558e-05, + "loss": 2.7875, + "step": 4565500 + }, + { + "epoch": 1.4194075854061523, + "grad_norm": 11.27332878112793, + "learning_rate": 2.634320690989746e-05, + "loss": 2.8411, + "step": 4566000 + }, + { + "epoch": 1.4195630176866392, + "grad_norm": 9.80712604522705, + "learning_rate": 2.6340616371889348e-05, + "loss": 2.8189, + "step": 4566500 + }, + { + "epoch": 1.419718449967126, + "grad_norm": 8.997063636779785, + "learning_rate": 2.6338025833881235e-05, + "loss": 2.7704, + "step": 4567000 + }, + { + "epoch": 1.419873882247613, + "grad_norm": 7.885993480682373, + "learning_rate": 2.633543529587312e-05, + "loss": 2.8116, + "step": 4567500 + }, + { + "epoch": 1.4200293145280998, + "grad_norm": 9.356708526611328, + "learning_rate": 2.6332844757865006e-05, + "loss": 2.8178, + "step": 4568000 + }, + { + "epoch": 1.4201847468085866, + "grad_norm": 10.261605262756348, + "learning_rate": 2.6330254219856887e-05, + "loss": 2.7333, + "step": 4568500 + }, + { + "epoch": 1.4203401790890735, + "grad_norm": 26.184391021728516, + "learning_rate": 2.6327663681848774e-05, + "loss": 2.8053, + "step": 4569000 + }, + { + "epoch": 1.4204956113695604, + "grad_norm": 8.653728485107422, + "learning_rate": 2.632507314384066e-05, + "loss": 2.8143, + "step": 4569500 + }, + { + "epoch": 1.4206510436500475, + "grad_norm": 12.07571792602539, + "learning_rate": 2.6322482605832545e-05, + "loss": 2.753, + "step": 4570000 + }, + { + "epoch": 1.4208064759305343, + "grad_norm": 9.99440860748291, + "learning_rate": 2.6319892067824432e-05, + "loss": 2.8115, + "step": 4570500 + }, + { + "epoch": 1.4209619082110212, + "grad_norm": 8.697677612304688, + "learning_rate": 2.631730152981632e-05, + "loss": 2.7741, + "step": 4571000 + }, + { + "epoch": 1.421117340491508, + "grad_norm": 9.985825538635254, + "learning_rate": 2.63147109918082e-05, + "loss": 2.7524, + "step": 4571500 + }, + { + "epoch": 1.421272772771995, + "grad_norm": 9.528422355651855, + "learning_rate": 2.6312120453800087e-05, + "loss": 2.8004, + "step": 4572000 + }, + { + "epoch": 1.4214282050524818, + "grad_norm": 7.9118547439575195, + "learning_rate": 2.6309529915791974e-05, + "loss": 2.7927, + "step": 4572500 + }, + { + "epoch": 1.4215836373329687, + "grad_norm": 10.120991706848145, + "learning_rate": 2.6306939377783857e-05, + "loss": 2.7846, + "step": 4573000 + }, + { + "epoch": 1.4217390696134555, + "grad_norm": 8.589073181152344, + "learning_rate": 2.6304348839775745e-05, + "loss": 2.7744, + "step": 4573500 + }, + { + "epoch": 1.4218945018939424, + "grad_norm": 11.332347869873047, + "learning_rate": 2.6301758301767632e-05, + "loss": 2.8224, + "step": 4574000 + }, + { + "epoch": 1.4220499341744293, + "grad_norm": 9.763421058654785, + "learning_rate": 2.6299167763759512e-05, + "loss": 2.8102, + "step": 4574500 + }, + { + "epoch": 1.4222053664549161, + "grad_norm": 10.782401084899902, + "learning_rate": 2.62965772257514e-05, + "loss": 2.7746, + "step": 4575000 + }, + { + "epoch": 1.422360798735403, + "grad_norm": 7.301127910614014, + "learning_rate": 2.6293986687743283e-05, + "loss": 2.7954, + "step": 4575500 + }, + { + "epoch": 1.4225162310158899, + "grad_norm": 8.356568336486816, + "learning_rate": 2.629139614973517e-05, + "loss": 2.7718, + "step": 4576000 + }, + { + "epoch": 1.4226716632963767, + "grad_norm": 9.1626615524292, + "learning_rate": 2.6288805611727057e-05, + "loss": 2.7699, + "step": 4576500 + }, + { + "epoch": 1.4228270955768636, + "grad_norm": 9.427963256835938, + "learning_rate": 2.6286215073718938e-05, + "loss": 2.7896, + "step": 4577000 + }, + { + "epoch": 1.4229825278573505, + "grad_norm": 9.99435043334961, + "learning_rate": 2.628362453571083e-05, + "loss": 2.7722, + "step": 4577500 + }, + { + "epoch": 1.4231379601378373, + "grad_norm": 7.531030654907227, + "learning_rate": 2.6281033997702715e-05, + "loss": 2.8775, + "step": 4578000 + }, + { + "epoch": 1.4232933924183242, + "grad_norm": 7.105301856994629, + "learning_rate": 2.6278443459694596e-05, + "loss": 2.8457, + "step": 4578500 + }, + { + "epoch": 1.423448824698811, + "grad_norm": 6.360909461975098, + "learning_rate": 2.6275852921686483e-05, + "loss": 2.8171, + "step": 4579000 + }, + { + "epoch": 1.423604256979298, + "grad_norm": 26.073556900024414, + "learning_rate": 2.627326238367837e-05, + "loss": 2.7566, + "step": 4579500 + }, + { + "epoch": 1.4237596892597848, + "grad_norm": 65.26721954345703, + "learning_rate": 2.6270671845670254e-05, + "loss": 2.7772, + "step": 4580000 + }, + { + "epoch": 1.4239151215402717, + "grad_norm": 11.441094398498535, + "learning_rate": 2.626808130766214e-05, + "loss": 2.763, + "step": 4580500 + }, + { + "epoch": 1.4240705538207585, + "grad_norm": 8.455944061279297, + "learning_rate": 2.626549076965402e-05, + "loss": 2.8344, + "step": 4581000 + }, + { + "epoch": 1.4242259861012454, + "grad_norm": 8.465507507324219, + "learning_rate": 2.626290023164591e-05, + "loss": 2.837, + "step": 4581500 + }, + { + "epoch": 1.4243814183817323, + "grad_norm": 11.495062828063965, + "learning_rate": 2.6260309693637796e-05, + "loss": 2.7387, + "step": 4582000 + }, + { + "epoch": 1.4245368506622191, + "grad_norm": 9.162836074829102, + "learning_rate": 2.625771915562968e-05, + "loss": 2.8144, + "step": 4582500 + }, + { + "epoch": 1.424692282942706, + "grad_norm": 24.46886444091797, + "learning_rate": 2.6255128617621567e-05, + "loss": 2.8043, + "step": 4583000 + }, + { + "epoch": 1.4248477152231929, + "grad_norm": 35.99898147583008, + "learning_rate": 2.6252538079613454e-05, + "loss": 2.787, + "step": 4583500 + }, + { + "epoch": 1.4250031475036797, + "grad_norm": 10.814208030700684, + "learning_rate": 2.6249947541605334e-05, + "loss": 2.7977, + "step": 4584000 + }, + { + "epoch": 1.4251585797841666, + "grad_norm": 9.322745323181152, + "learning_rate": 2.624735700359722e-05, + "loss": 2.7982, + "step": 4584500 + }, + { + "epoch": 1.4253140120646537, + "grad_norm": 20.061315536499023, + "learning_rate": 2.624476646558911e-05, + "loss": 2.8153, + "step": 4585000 + }, + { + "epoch": 1.4254694443451406, + "grad_norm": 14.574007034301758, + "learning_rate": 2.6242175927580992e-05, + "loss": 2.797, + "step": 4585500 + }, + { + "epoch": 1.4256248766256274, + "grad_norm": 26.0188045501709, + "learning_rate": 2.623958538957288e-05, + "loss": 2.8198, + "step": 4586000 + }, + { + "epoch": 1.4257803089061143, + "grad_norm": 9.787434577941895, + "learning_rate": 2.623699485156476e-05, + "loss": 2.7879, + "step": 4586500 + }, + { + "epoch": 1.4259357411866012, + "grad_norm": 10.349115371704102, + "learning_rate": 2.6234404313556647e-05, + "loss": 2.8004, + "step": 4587000 + }, + { + "epoch": 1.426091173467088, + "grad_norm": 12.3941068649292, + "learning_rate": 2.6231813775548538e-05, + "loss": 2.8367, + "step": 4587500 + }, + { + "epoch": 1.426246605747575, + "grad_norm": 12.406582832336426, + "learning_rate": 2.6229223237540418e-05, + "loss": 2.7902, + "step": 4588000 + }, + { + "epoch": 1.4264020380280618, + "grad_norm": 12.153340339660645, + "learning_rate": 2.6226632699532305e-05, + "loss": 2.7992, + "step": 4588500 + }, + { + "epoch": 1.4265574703085486, + "grad_norm": 11.702202796936035, + "learning_rate": 2.6224042161524192e-05, + "loss": 2.8198, + "step": 4589000 + }, + { + "epoch": 1.4267129025890355, + "grad_norm": 48.349205017089844, + "learning_rate": 2.6221451623516076e-05, + "loss": 2.8535, + "step": 4589500 + }, + { + "epoch": 1.4268683348695224, + "grad_norm": 9.481316566467285, + "learning_rate": 2.6218861085507963e-05, + "loss": 2.7896, + "step": 4590000 + }, + { + "epoch": 1.4270237671500092, + "grad_norm": 13.230758666992188, + "learning_rate": 2.621627054749985e-05, + "loss": 2.774, + "step": 4590500 + }, + { + "epoch": 1.427179199430496, + "grad_norm": 7.896139621734619, + "learning_rate": 2.621368000949173e-05, + "loss": 2.8267, + "step": 4591000 + }, + { + "epoch": 1.427334631710983, + "grad_norm": 11.233466148376465, + "learning_rate": 2.6211089471483618e-05, + "loss": 2.7792, + "step": 4591500 + }, + { + "epoch": 1.4274900639914698, + "grad_norm": 11.534931182861328, + "learning_rate": 2.6208498933475505e-05, + "loss": 2.761, + "step": 4592000 + }, + { + "epoch": 1.4276454962719567, + "grad_norm": 10.114480018615723, + "learning_rate": 2.620590839546739e-05, + "loss": 2.7864, + "step": 4592500 + }, + { + "epoch": 1.4278009285524436, + "grad_norm": 14.786604881286621, + "learning_rate": 2.6203317857459276e-05, + "loss": 2.8105, + "step": 4593000 + }, + { + "epoch": 1.4279563608329304, + "grad_norm": 8.554396629333496, + "learning_rate": 2.6200727319451156e-05, + "loss": 2.8283, + "step": 4593500 + }, + { + "epoch": 1.4281117931134175, + "grad_norm": 10.521141052246094, + "learning_rate": 2.6198136781443043e-05, + "loss": 2.807, + "step": 4594000 + }, + { + "epoch": 1.4282672253939044, + "grad_norm": 7.708629608154297, + "learning_rate": 2.619554624343493e-05, + "loss": 2.8108, + "step": 4594500 + }, + { + "epoch": 1.4284226576743912, + "grad_norm": 9.073185920715332, + "learning_rate": 2.6192955705426814e-05, + "loss": 2.8361, + "step": 4595000 + }, + { + "epoch": 1.4285780899548781, + "grad_norm": 10.577441215515137, + "learning_rate": 2.61903651674187e-05, + "loss": 2.8141, + "step": 4595500 + }, + { + "epoch": 1.428733522235365, + "grad_norm": 11.405040740966797, + "learning_rate": 2.618777462941059e-05, + "loss": 2.7823, + "step": 4596000 + }, + { + "epoch": 1.4288889545158519, + "grad_norm": 8.920934677124023, + "learning_rate": 2.618518409140247e-05, + "loss": 2.8068, + "step": 4596500 + }, + { + "epoch": 1.4290443867963387, + "grad_norm": 10.613773345947266, + "learning_rate": 2.6182593553394356e-05, + "loss": 2.8051, + "step": 4597000 + }, + { + "epoch": 1.4291998190768256, + "grad_norm": 14.242118835449219, + "learning_rate": 2.6180003015386247e-05, + "loss": 2.7389, + "step": 4597500 + }, + { + "epoch": 1.4293552513573125, + "grad_norm": 8.877479553222656, + "learning_rate": 2.6177412477378127e-05, + "loss": 2.8175, + "step": 4598000 + }, + { + "epoch": 1.4295106836377993, + "grad_norm": 10.56131649017334, + "learning_rate": 2.6174821939370014e-05, + "loss": 2.7584, + "step": 4598500 + }, + { + "epoch": 1.4296661159182862, + "grad_norm": 8.969587326049805, + "learning_rate": 2.6172231401361898e-05, + "loss": 2.7829, + "step": 4599000 + }, + { + "epoch": 1.429821548198773, + "grad_norm": 10.40592098236084, + "learning_rate": 2.6169640863353785e-05, + "loss": 2.7738, + "step": 4599500 + }, + { + "epoch": 1.42997698047926, + "grad_norm": 9.997756958007812, + "learning_rate": 2.6167050325345672e-05, + "loss": 2.7867, + "step": 4600000 + }, + { + "epoch": 1.4301324127597468, + "grad_norm": 8.591745376586914, + "learning_rate": 2.6164459787337553e-05, + "loss": 2.7824, + "step": 4600500 + }, + { + "epoch": 1.4302878450402337, + "grad_norm": 8.011302947998047, + "learning_rate": 2.616186924932944e-05, + "loss": 2.815, + "step": 4601000 + }, + { + "epoch": 1.4304432773207205, + "grad_norm": 7.779779434204102, + "learning_rate": 2.6159278711321327e-05, + "loss": 2.787, + "step": 4601500 + }, + { + "epoch": 1.4305987096012074, + "grad_norm": 8.459696769714355, + "learning_rate": 2.615668817331321e-05, + "loss": 2.7823, + "step": 4602000 + }, + { + "epoch": 1.4307541418816943, + "grad_norm": 10.941848754882812, + "learning_rate": 2.6154097635305098e-05, + "loss": 2.8303, + "step": 4602500 + }, + { + "epoch": 1.4309095741621811, + "grad_norm": 9.422835350036621, + "learning_rate": 2.6151507097296985e-05, + "loss": 2.7578, + "step": 4603000 + }, + { + "epoch": 1.431065006442668, + "grad_norm": 11.2805814743042, + "learning_rate": 2.6148916559288865e-05, + "loss": 2.7881, + "step": 4603500 + }, + { + "epoch": 1.4312204387231549, + "grad_norm": 9.103318214416504, + "learning_rate": 2.6146326021280753e-05, + "loss": 2.7591, + "step": 4604000 + }, + { + "epoch": 1.4313758710036417, + "grad_norm": 25.172269821166992, + "learning_rate": 2.6143735483272636e-05, + "loss": 2.818, + "step": 4604500 + }, + { + "epoch": 1.4315313032841286, + "grad_norm": 18.872028350830078, + "learning_rate": 2.6141144945264524e-05, + "loss": 2.7822, + "step": 4605000 + }, + { + "epoch": 1.4316867355646155, + "grad_norm": 11.733381271362305, + "learning_rate": 2.613855440725641e-05, + "loss": 2.8076, + "step": 4605500 + }, + { + "epoch": 1.4318421678451023, + "grad_norm": 10.484951972961426, + "learning_rate": 2.613596386924829e-05, + "loss": 2.6937, + "step": 4606000 + }, + { + "epoch": 1.4319976001255892, + "grad_norm": 15.923379898071289, + "learning_rate": 2.6133373331240178e-05, + "loss": 2.7568, + "step": 4606500 + }, + { + "epoch": 1.432153032406076, + "grad_norm": 10.387076377868652, + "learning_rate": 2.6130782793232065e-05, + "loss": 2.8252, + "step": 4607000 + }, + { + "epoch": 1.432308464686563, + "grad_norm": 8.218607902526855, + "learning_rate": 2.612819225522395e-05, + "loss": 2.8018, + "step": 4607500 + }, + { + "epoch": 1.4324638969670498, + "grad_norm": 8.3123779296875, + "learning_rate": 2.6125601717215836e-05, + "loss": 2.8495, + "step": 4608000 + }, + { + "epoch": 1.4326193292475367, + "grad_norm": 12.16081714630127, + "learning_rate": 2.6123011179207723e-05, + "loss": 2.8367, + "step": 4608500 + }, + { + "epoch": 1.4327747615280237, + "grad_norm": 15.701761245727539, + "learning_rate": 2.6120420641199607e-05, + "loss": 2.8162, + "step": 4609000 + }, + { + "epoch": 1.4329301938085106, + "grad_norm": 11.775442123413086, + "learning_rate": 2.6117830103191494e-05, + "loss": 2.8158, + "step": 4609500 + }, + { + "epoch": 1.4330856260889975, + "grad_norm": 13.258033752441406, + "learning_rate": 2.611523956518338e-05, + "loss": 2.8197, + "step": 4610000 + }, + { + "epoch": 1.4332410583694843, + "grad_norm": 9.826074600219727, + "learning_rate": 2.6112649027175262e-05, + "loss": 2.7739, + "step": 4610500 + }, + { + "epoch": 1.4333964906499712, + "grad_norm": 11.188642501831055, + "learning_rate": 2.611005848916715e-05, + "loss": 2.7633, + "step": 4611000 + }, + { + "epoch": 1.433551922930458, + "grad_norm": 10.69668197631836, + "learning_rate": 2.6107467951159033e-05, + "loss": 2.822, + "step": 4611500 + }, + { + "epoch": 1.433707355210945, + "grad_norm": 9.276731491088867, + "learning_rate": 2.610487741315092e-05, + "loss": 2.8128, + "step": 4612000 + }, + { + "epoch": 1.4338627874914318, + "grad_norm": 14.93817138671875, + "learning_rate": 2.6102286875142807e-05, + "loss": 2.7775, + "step": 4612500 + }, + { + "epoch": 1.4340182197719187, + "grad_norm": 7.050797939300537, + "learning_rate": 2.6099696337134687e-05, + "loss": 2.7901, + "step": 4613000 + }, + { + "epoch": 1.4341736520524055, + "grad_norm": 40.02067184448242, + "learning_rate": 2.6097105799126575e-05, + "loss": 2.7712, + "step": 4613500 + }, + { + "epoch": 1.4343290843328924, + "grad_norm": 8.719022750854492, + "learning_rate": 2.6094515261118462e-05, + "loss": 2.7794, + "step": 4614000 + }, + { + "epoch": 1.4344845166133793, + "grad_norm": 9.066353797912598, + "learning_rate": 2.6091924723110346e-05, + "loss": 2.8075, + "step": 4614500 + }, + { + "epoch": 1.4346399488938661, + "grad_norm": 9.675532341003418, + "learning_rate": 2.6089334185102233e-05, + "loss": 2.7745, + "step": 4615000 + }, + { + "epoch": 1.434795381174353, + "grad_norm": 10.5717191696167, + "learning_rate": 2.608674364709412e-05, + "loss": 2.8171, + "step": 4615500 + }, + { + "epoch": 1.4349508134548399, + "grad_norm": 11.039107322692871, + "learning_rate": 2.6084153109086e-05, + "loss": 2.7439, + "step": 4616000 + }, + { + "epoch": 1.4351062457353267, + "grad_norm": 9.199071884155273, + "learning_rate": 2.6081562571077887e-05, + "loss": 2.7837, + "step": 4616500 + }, + { + "epoch": 1.4352616780158136, + "grad_norm": 7.999239444732666, + "learning_rate": 2.607897203306977e-05, + "loss": 2.7897, + "step": 4617000 + }, + { + "epoch": 1.4354171102963005, + "grad_norm": 10.03913402557373, + "learning_rate": 2.6076381495061658e-05, + "loss": 2.8361, + "step": 4617500 + }, + { + "epoch": 1.4355725425767876, + "grad_norm": 6.251304626464844, + "learning_rate": 2.6073790957053545e-05, + "loss": 2.7358, + "step": 4618000 + }, + { + "epoch": 1.4357279748572744, + "grad_norm": 9.762558937072754, + "learning_rate": 2.6071200419045426e-05, + "loss": 2.8151, + "step": 4618500 + }, + { + "epoch": 1.4358834071377613, + "grad_norm": 10.697749137878418, + "learning_rate": 2.6068609881037316e-05, + "loss": 2.7886, + "step": 4619000 + }, + { + "epoch": 1.4360388394182482, + "grad_norm": 7.79780912399292, + "learning_rate": 2.6066019343029204e-05, + "loss": 2.7979, + "step": 4619500 + }, + { + "epoch": 1.436194271698735, + "grad_norm": 8.559819221496582, + "learning_rate": 2.6063428805021084e-05, + "loss": 2.7795, + "step": 4620000 + }, + { + "epoch": 1.436349703979222, + "grad_norm": 8.729842185974121, + "learning_rate": 2.606083826701297e-05, + "loss": 2.8304, + "step": 4620500 + }, + { + "epoch": 1.4365051362597088, + "grad_norm": 12.318072319030762, + "learning_rate": 2.6058247729004858e-05, + "loss": 2.775, + "step": 4621000 + }, + { + "epoch": 1.4366605685401956, + "grad_norm": 10.211541175842285, + "learning_rate": 2.6055657190996742e-05, + "loss": 2.7826, + "step": 4621500 + }, + { + "epoch": 1.4368160008206825, + "grad_norm": 8.053129196166992, + "learning_rate": 2.605306665298863e-05, + "loss": 2.7769, + "step": 4622000 + }, + { + "epoch": 1.4369714331011694, + "grad_norm": 9.514410972595215, + "learning_rate": 2.605047611498051e-05, + "loss": 2.794, + "step": 4622500 + }, + { + "epoch": 1.4371268653816562, + "grad_norm": 10.265701293945312, + "learning_rate": 2.6047885576972397e-05, + "loss": 2.7752, + "step": 4623000 + }, + { + "epoch": 1.437282297662143, + "grad_norm": 10.669422149658203, + "learning_rate": 2.6045295038964284e-05, + "loss": 2.7782, + "step": 4623500 + }, + { + "epoch": 1.43743772994263, + "grad_norm": 8.726973533630371, + "learning_rate": 2.6042704500956168e-05, + "loss": 2.7504, + "step": 4624000 + }, + { + "epoch": 1.4375931622231168, + "grad_norm": 9.77680492401123, + "learning_rate": 2.6040113962948055e-05, + "loss": 2.8498, + "step": 4624500 + }, + { + "epoch": 1.4377485945036037, + "grad_norm": 8.660392761230469, + "learning_rate": 2.6037523424939942e-05, + "loss": 2.7701, + "step": 4625000 + }, + { + "epoch": 1.4379040267840906, + "grad_norm": 10.368542671203613, + "learning_rate": 2.6034932886931822e-05, + "loss": 2.7528, + "step": 4625500 + }, + { + "epoch": 1.4380594590645774, + "grad_norm": 10.825652122497559, + "learning_rate": 2.603234234892371e-05, + "loss": 2.8135, + "step": 4626000 + }, + { + "epoch": 1.4382148913450643, + "grad_norm": 11.217453956604004, + "learning_rate": 2.6029751810915597e-05, + "loss": 2.7832, + "step": 4626500 + }, + { + "epoch": 1.4383703236255512, + "grad_norm": 10.108907699584961, + "learning_rate": 2.602716127290748e-05, + "loss": 2.7388, + "step": 4627000 + }, + { + "epoch": 1.438525755906038, + "grad_norm": 8.416417121887207, + "learning_rate": 2.6024570734899367e-05, + "loss": 2.7959, + "step": 4627500 + }, + { + "epoch": 1.438681188186525, + "grad_norm": 13.201555252075195, + "learning_rate": 2.6021980196891255e-05, + "loss": 2.7921, + "step": 4628000 + }, + { + "epoch": 1.4388366204670118, + "grad_norm": 7.310007095336914, + "learning_rate": 2.6019389658883135e-05, + "loss": 2.77, + "step": 4628500 + }, + { + "epoch": 1.4389920527474986, + "grad_norm": 7.951028347015381, + "learning_rate": 2.6016799120875026e-05, + "loss": 2.7993, + "step": 4629000 + }, + { + "epoch": 1.4391474850279855, + "grad_norm": 11.151131629943848, + "learning_rate": 2.6014208582866906e-05, + "loss": 2.7745, + "step": 4629500 + }, + { + "epoch": 1.4393029173084724, + "grad_norm": 11.268906593322754, + "learning_rate": 2.6011618044858793e-05, + "loss": 2.7753, + "step": 4630000 + }, + { + "epoch": 1.4394583495889592, + "grad_norm": 8.383987426757812, + "learning_rate": 2.600902750685068e-05, + "loss": 2.7788, + "step": 4630500 + }, + { + "epoch": 1.439613781869446, + "grad_norm": 7.6107282638549805, + "learning_rate": 2.6006436968842564e-05, + "loss": 2.8039, + "step": 4631000 + }, + { + "epoch": 1.439769214149933, + "grad_norm": 7.708819389343262, + "learning_rate": 2.600384643083445e-05, + "loss": 2.7652, + "step": 4631500 + }, + { + "epoch": 1.4399246464304198, + "grad_norm": 9.844725608825684, + "learning_rate": 2.6001255892826338e-05, + "loss": 2.8093, + "step": 4632000 + }, + { + "epoch": 1.4400800787109067, + "grad_norm": 50.55385208129883, + "learning_rate": 2.599866535481822e-05, + "loss": 2.7858, + "step": 4632500 + }, + { + "epoch": 1.4402355109913938, + "grad_norm": 8.359232902526855, + "learning_rate": 2.5996074816810106e-05, + "loss": 2.7685, + "step": 4633000 + }, + { + "epoch": 1.4403909432718807, + "grad_norm": 12.390947341918945, + "learning_rate": 2.5993484278801993e-05, + "loss": 2.8148, + "step": 4633500 + }, + { + "epoch": 1.4405463755523675, + "grad_norm": 23.405601501464844, + "learning_rate": 2.5990893740793877e-05, + "loss": 2.7736, + "step": 4634000 + }, + { + "epoch": 1.4407018078328544, + "grad_norm": 8.593560218811035, + "learning_rate": 2.5988303202785764e-05, + "loss": 2.8588, + "step": 4634500 + }, + { + "epoch": 1.4408572401133413, + "grad_norm": 18.938899993896484, + "learning_rate": 2.5985712664777644e-05, + "loss": 2.8102, + "step": 4635000 + }, + { + "epoch": 1.4410126723938281, + "grad_norm": 11.047531127929688, + "learning_rate": 2.598312212676953e-05, + "loss": 2.8112, + "step": 4635500 + }, + { + "epoch": 1.441168104674315, + "grad_norm": 8.485608100891113, + "learning_rate": 2.598053158876142e-05, + "loss": 2.7869, + "step": 4636000 + }, + { + "epoch": 1.4413235369548019, + "grad_norm": 6.639595031738281, + "learning_rate": 2.5977941050753302e-05, + "loss": 2.7545, + "step": 4636500 + }, + { + "epoch": 1.4414789692352887, + "grad_norm": 8.266875267028809, + "learning_rate": 2.597535051274519e-05, + "loss": 2.8255, + "step": 4637000 + }, + { + "epoch": 1.4416344015157756, + "grad_norm": 10.20842456817627, + "learning_rate": 2.5972759974737077e-05, + "loss": 2.805, + "step": 4637500 + }, + { + "epoch": 1.4417898337962625, + "grad_norm": 9.281570434570312, + "learning_rate": 2.5970169436728957e-05, + "loss": 2.8047, + "step": 4638000 + }, + { + "epoch": 1.4419452660767493, + "grad_norm": 10.034026145935059, + "learning_rate": 2.5967578898720844e-05, + "loss": 2.7559, + "step": 4638500 + }, + { + "epoch": 1.4421006983572362, + "grad_norm": 10.345827102661133, + "learning_rate": 2.5964988360712735e-05, + "loss": 2.7756, + "step": 4639000 + }, + { + "epoch": 1.442256130637723, + "grad_norm": 8.136220932006836, + "learning_rate": 2.5962397822704615e-05, + "loss": 2.7603, + "step": 4639500 + }, + { + "epoch": 1.44241156291821, + "grad_norm": 8.553062438964844, + "learning_rate": 2.5959807284696502e-05, + "loss": 2.791, + "step": 4640000 + }, + { + "epoch": 1.4425669951986968, + "grad_norm": 10.892875671386719, + "learning_rate": 2.5957216746688383e-05, + "loss": 2.7847, + "step": 4640500 + }, + { + "epoch": 1.4427224274791837, + "grad_norm": 13.57990837097168, + "learning_rate": 2.5954626208680273e-05, + "loss": 2.8401, + "step": 4641000 + }, + { + "epoch": 1.4428778597596705, + "grad_norm": 9.184260368347168, + "learning_rate": 2.595203567067216e-05, + "loss": 2.7646, + "step": 4641500 + }, + { + "epoch": 1.4430332920401576, + "grad_norm": 11.946491241455078, + "learning_rate": 2.594944513266404e-05, + "loss": 2.8213, + "step": 4642000 + }, + { + "epoch": 1.4431887243206445, + "grad_norm": 11.63405704498291, + "learning_rate": 2.5946854594655928e-05, + "loss": 2.7584, + "step": 4642500 + }, + { + "epoch": 1.4433441566011314, + "grad_norm": 8.363919258117676, + "learning_rate": 2.5944264056647815e-05, + "loss": 2.7848, + "step": 4643000 + }, + { + "epoch": 1.4434995888816182, + "grad_norm": 11.898460388183594, + "learning_rate": 2.59416735186397e-05, + "loss": 2.7895, + "step": 4643500 + }, + { + "epoch": 1.443655021162105, + "grad_norm": 7.70801305770874, + "learning_rate": 2.5939082980631586e-05, + "loss": 2.8176, + "step": 4644000 + }, + { + "epoch": 1.443810453442592, + "grad_norm": 8.396252632141113, + "learning_rate": 2.5936492442623473e-05, + "loss": 2.7984, + "step": 4644500 + }, + { + "epoch": 1.4439658857230788, + "grad_norm": 8.171467781066895, + "learning_rate": 2.5933901904615353e-05, + "loss": 2.8143, + "step": 4645000 + }, + { + "epoch": 1.4441213180035657, + "grad_norm": 10.281352043151855, + "learning_rate": 2.593131136660724e-05, + "loss": 2.7956, + "step": 4645500 + }, + { + "epoch": 1.4442767502840526, + "grad_norm": 9.043387413024902, + "learning_rate": 2.5928720828599128e-05, + "loss": 2.8182, + "step": 4646000 + }, + { + "epoch": 1.4444321825645394, + "grad_norm": 10.838057518005371, + "learning_rate": 2.592613029059101e-05, + "loss": 2.8179, + "step": 4646500 + }, + { + "epoch": 1.4445876148450263, + "grad_norm": 9.70473861694336, + "learning_rate": 2.59235397525829e-05, + "loss": 2.7551, + "step": 4647000 + }, + { + "epoch": 1.4447430471255132, + "grad_norm": 10.876824378967285, + "learning_rate": 2.592094921457478e-05, + "loss": 2.8203, + "step": 4647500 + }, + { + "epoch": 1.444898479406, + "grad_norm": 9.059392929077148, + "learning_rate": 2.5918358676566666e-05, + "loss": 2.8025, + "step": 4648000 + }, + { + "epoch": 1.445053911686487, + "grad_norm": 19.91460418701172, + "learning_rate": 2.5915768138558553e-05, + "loss": 2.8226, + "step": 4648500 + }, + { + "epoch": 1.4452093439669738, + "grad_norm": 8.303434371948242, + "learning_rate": 2.5913177600550437e-05, + "loss": 2.7387, + "step": 4649000 + }, + { + "epoch": 1.4453647762474606, + "grad_norm": 8.545709609985352, + "learning_rate": 2.5910587062542324e-05, + "loss": 2.7917, + "step": 4649500 + }, + { + "epoch": 1.4455202085279475, + "grad_norm": 8.453140258789062, + "learning_rate": 2.590799652453421e-05, + "loss": 2.8089, + "step": 4650000 + }, + { + "epoch": 1.4456756408084344, + "grad_norm": 17.374788284301758, + "learning_rate": 2.5905405986526092e-05, + "loss": 2.8057, + "step": 4650500 + }, + { + "epoch": 1.4458310730889212, + "grad_norm": 10.673272132873535, + "learning_rate": 2.5902815448517982e-05, + "loss": 2.757, + "step": 4651000 + }, + { + "epoch": 1.445986505369408, + "grad_norm": 7.886049270629883, + "learning_rate": 2.590022491050987e-05, + "loss": 2.7691, + "step": 4651500 + }, + { + "epoch": 1.446141937649895, + "grad_norm": 7.498291015625, + "learning_rate": 2.589763437250175e-05, + "loss": 2.7934, + "step": 4652000 + }, + { + "epoch": 1.4462973699303818, + "grad_norm": 7.7043938636779785, + "learning_rate": 2.5895043834493637e-05, + "loss": 2.8122, + "step": 4652500 + }, + { + "epoch": 1.4464528022108687, + "grad_norm": 10.43281364440918, + "learning_rate": 2.589245329648552e-05, + "loss": 2.7625, + "step": 4653000 + }, + { + "epoch": 1.4466082344913556, + "grad_norm": 9.003466606140137, + "learning_rate": 2.5889862758477408e-05, + "loss": 2.7652, + "step": 4653500 + }, + { + "epoch": 1.4467636667718424, + "grad_norm": 11.435949325561523, + "learning_rate": 2.5887272220469295e-05, + "loss": 2.7878, + "step": 4654000 + }, + { + "epoch": 1.4469190990523293, + "grad_norm": 10.602933883666992, + "learning_rate": 2.5884681682461176e-05, + "loss": 2.8378, + "step": 4654500 + }, + { + "epoch": 1.4470745313328162, + "grad_norm": 9.04488754272461, + "learning_rate": 2.5882091144453063e-05, + "loss": 2.7517, + "step": 4655000 + }, + { + "epoch": 1.447229963613303, + "grad_norm": 8.887311935424805, + "learning_rate": 2.587950060644495e-05, + "loss": 2.828, + "step": 4655500 + }, + { + "epoch": 1.44738539589379, + "grad_norm": 8.345609664916992, + "learning_rate": 2.5876910068436834e-05, + "loss": 2.7797, + "step": 4656000 + }, + { + "epoch": 1.4475408281742768, + "grad_norm": 12.163595199584961, + "learning_rate": 2.587431953042872e-05, + "loss": 2.7833, + "step": 4656500 + }, + { + "epoch": 1.4476962604547636, + "grad_norm": 8.033475875854492, + "learning_rate": 2.5871728992420608e-05, + "loss": 2.7571, + "step": 4657000 + }, + { + "epoch": 1.4478516927352507, + "grad_norm": 10.998634338378906, + "learning_rate": 2.5869138454412488e-05, + "loss": 2.8175, + "step": 4657500 + }, + { + "epoch": 1.4480071250157376, + "grad_norm": 10.774785041809082, + "learning_rate": 2.5866547916404375e-05, + "loss": 2.7735, + "step": 4658000 + }, + { + "epoch": 1.4481625572962245, + "grad_norm": 60.96044158935547, + "learning_rate": 2.5863957378396263e-05, + "loss": 2.837, + "step": 4658500 + }, + { + "epoch": 1.4483179895767113, + "grad_norm": 9.079150199890137, + "learning_rate": 2.5861366840388146e-05, + "loss": 2.8217, + "step": 4659000 + }, + { + "epoch": 1.4484734218571982, + "grad_norm": 7.856088638305664, + "learning_rate": 2.5858776302380033e-05, + "loss": 2.7582, + "step": 4659500 + }, + { + "epoch": 1.448628854137685, + "grad_norm": 9.602023124694824, + "learning_rate": 2.5856185764371914e-05, + "loss": 2.7921, + "step": 4660000 + }, + { + "epoch": 1.448784286418172, + "grad_norm": 9.638696670532227, + "learning_rate": 2.58535952263638e-05, + "loss": 2.8384, + "step": 4660500 + }, + { + "epoch": 1.4489397186986588, + "grad_norm": 8.701981544494629, + "learning_rate": 2.585100468835569e-05, + "loss": 2.7601, + "step": 4661000 + }, + { + "epoch": 1.4490951509791457, + "grad_norm": 9.35268783569336, + "learning_rate": 2.5848414150347572e-05, + "loss": 2.793, + "step": 4661500 + }, + { + "epoch": 1.4492505832596325, + "grad_norm": 10.251928329467773, + "learning_rate": 2.584582361233946e-05, + "loss": 2.7795, + "step": 4662000 + }, + { + "epoch": 1.4494060155401194, + "grad_norm": 12.008115768432617, + "learning_rate": 2.5843233074331346e-05, + "loss": 2.8032, + "step": 4662500 + }, + { + "epoch": 1.4495614478206063, + "grad_norm": 20.168869018554688, + "learning_rate": 2.584064253632323e-05, + "loss": 2.8288, + "step": 4663000 + }, + { + "epoch": 1.4497168801010931, + "grad_norm": 9.045636177062988, + "learning_rate": 2.5838051998315117e-05, + "loss": 2.8275, + "step": 4663500 + }, + { + "epoch": 1.44987231238158, + "grad_norm": 9.639267921447754, + "learning_rate": 2.5835461460307004e-05, + "loss": 2.8086, + "step": 4664000 + }, + { + "epoch": 1.4500277446620669, + "grad_norm": 10.973404884338379, + "learning_rate": 2.5832870922298885e-05, + "loss": 2.792, + "step": 4664500 + }, + { + "epoch": 1.4501831769425537, + "grad_norm": 8.500855445861816, + "learning_rate": 2.5830280384290772e-05, + "loss": 2.7581, + "step": 4665000 + }, + { + "epoch": 1.4503386092230406, + "grad_norm": 11.031729698181152, + "learning_rate": 2.5827689846282656e-05, + "loss": 2.7661, + "step": 4665500 + }, + { + "epoch": 1.4504940415035275, + "grad_norm": 11.448798179626465, + "learning_rate": 2.5825099308274543e-05, + "loss": 2.725, + "step": 4666000 + }, + { + "epoch": 1.4506494737840145, + "grad_norm": 9.078566551208496, + "learning_rate": 2.582250877026643e-05, + "loss": 2.7866, + "step": 4666500 + }, + { + "epoch": 1.4508049060645014, + "grad_norm": 10.140182495117188, + "learning_rate": 2.581991823225831e-05, + "loss": 2.7472, + "step": 4667000 + }, + { + "epoch": 1.4509603383449883, + "grad_norm": 9.209181785583496, + "learning_rate": 2.5817327694250197e-05, + "loss": 2.8496, + "step": 4667500 + }, + { + "epoch": 1.4511157706254751, + "grad_norm": 38.5648193359375, + "learning_rate": 2.5814737156242085e-05, + "loss": 2.7687, + "step": 4668000 + }, + { + "epoch": 1.451271202905962, + "grad_norm": 9.60317325592041, + "learning_rate": 2.581214661823397e-05, + "loss": 2.8218, + "step": 4668500 + }, + { + "epoch": 1.4514266351864489, + "grad_norm": 8.962075233459473, + "learning_rate": 2.5809556080225856e-05, + "loss": 2.8252, + "step": 4669000 + }, + { + "epoch": 1.4515820674669357, + "grad_norm": 5.718034744262695, + "learning_rate": 2.5806965542217743e-05, + "loss": 2.7865, + "step": 4669500 + }, + { + "epoch": 1.4517374997474226, + "grad_norm": 8.510781288146973, + "learning_rate": 2.5804375004209623e-05, + "loss": 2.8373, + "step": 4670000 + }, + { + "epoch": 1.4518929320279095, + "grad_norm": 8.418405532836914, + "learning_rate": 2.580178446620151e-05, + "loss": 2.8189, + "step": 4670500 + }, + { + "epoch": 1.4520483643083963, + "grad_norm": 6.9492692947387695, + "learning_rate": 2.5799193928193394e-05, + "loss": 2.7979, + "step": 4671000 + }, + { + "epoch": 1.4522037965888832, + "grad_norm": 68.83648681640625, + "learning_rate": 2.579660339018528e-05, + "loss": 2.8187, + "step": 4671500 + }, + { + "epoch": 1.45235922886937, + "grad_norm": 8.81164836883545, + "learning_rate": 2.5794012852177168e-05, + "loss": 2.7869, + "step": 4672000 + }, + { + "epoch": 1.452514661149857, + "grad_norm": 11.87602710723877, + "learning_rate": 2.579142231416905e-05, + "loss": 2.7877, + "step": 4672500 + }, + { + "epoch": 1.4526700934303438, + "grad_norm": 7.30971097946167, + "learning_rate": 2.578883177616094e-05, + "loss": 2.7598, + "step": 4673000 + }, + { + "epoch": 1.4528255257108307, + "grad_norm": 9.662322998046875, + "learning_rate": 2.5786241238152826e-05, + "loss": 2.7554, + "step": 4673500 + }, + { + "epoch": 1.4529809579913175, + "grad_norm": 18.850400924682617, + "learning_rate": 2.5783650700144707e-05, + "loss": 2.824, + "step": 4674000 + }, + { + "epoch": 1.4531363902718044, + "grad_norm": 10.001507759094238, + "learning_rate": 2.5781060162136594e-05, + "loss": 2.8239, + "step": 4674500 + }, + { + "epoch": 1.4532918225522913, + "grad_norm": 11.197190284729004, + "learning_rate": 2.577846962412848e-05, + "loss": 2.749, + "step": 4675000 + }, + { + "epoch": 1.4534472548327781, + "grad_norm": 8.719400405883789, + "learning_rate": 2.5775879086120365e-05, + "loss": 2.8275, + "step": 4675500 + }, + { + "epoch": 1.453602687113265, + "grad_norm": 11.60152530670166, + "learning_rate": 2.5773288548112252e-05, + "loss": 2.7829, + "step": 4676000 + }, + { + "epoch": 1.4537581193937519, + "grad_norm": 5.709870338439941, + "learning_rate": 2.577069801010414e-05, + "loss": 2.7794, + "step": 4676500 + }, + { + "epoch": 1.4539135516742387, + "grad_norm": 6.958976745605469, + "learning_rate": 2.576810747209602e-05, + "loss": 2.7475, + "step": 4677000 + }, + { + "epoch": 1.4540689839547256, + "grad_norm": 9.776273727416992, + "learning_rate": 2.5765516934087907e-05, + "loss": 2.7652, + "step": 4677500 + }, + { + "epoch": 1.4542244162352125, + "grad_norm": 11.491266250610352, + "learning_rate": 2.576292639607979e-05, + "loss": 2.8514, + "step": 4678000 + }, + { + "epoch": 1.4543798485156993, + "grad_norm": 11.711160659790039, + "learning_rate": 2.5760335858071678e-05, + "loss": 2.8115, + "step": 4678500 + }, + { + "epoch": 1.4545352807961862, + "grad_norm": 8.712011337280273, + "learning_rate": 2.5757745320063565e-05, + "loss": 2.7981, + "step": 4679000 + }, + { + "epoch": 1.454690713076673, + "grad_norm": 12.777524948120117, + "learning_rate": 2.5755154782055445e-05, + "loss": 2.8292, + "step": 4679500 + }, + { + "epoch": 1.45484614535716, + "grad_norm": 13.075356483459473, + "learning_rate": 2.5752564244047332e-05, + "loss": 2.7715, + "step": 4680000 + }, + { + "epoch": 1.4550015776376468, + "grad_norm": 9.222946166992188, + "learning_rate": 2.574997370603922e-05, + "loss": 2.785, + "step": 4680500 + }, + { + "epoch": 1.4551570099181337, + "grad_norm": 5.7285237312316895, + "learning_rate": 2.5747383168031103e-05, + "loss": 2.7731, + "step": 4681000 + }, + { + "epoch": 1.4553124421986208, + "grad_norm": 4.227281093597412, + "learning_rate": 2.574479263002299e-05, + "loss": 2.781, + "step": 4681500 + }, + { + "epoch": 1.4554678744791076, + "grad_norm": 11.299232482910156, + "learning_rate": 2.5742202092014877e-05, + "loss": 2.7584, + "step": 4682000 + }, + { + "epoch": 1.4556233067595945, + "grad_norm": 20.926555633544922, + "learning_rate": 2.5739611554006758e-05, + "loss": 2.8186, + "step": 4682500 + }, + { + "epoch": 1.4557787390400814, + "grad_norm": 9.474275588989258, + "learning_rate": 2.573702101599865e-05, + "loss": 2.7824, + "step": 4683000 + }, + { + "epoch": 1.4559341713205682, + "grad_norm": 7.905013561248779, + "learning_rate": 2.573443047799053e-05, + "loss": 2.7986, + "step": 4683500 + }, + { + "epoch": 1.456089603601055, + "grad_norm": 9.638097763061523, + "learning_rate": 2.5731839939982416e-05, + "loss": 2.7197, + "step": 4684000 + }, + { + "epoch": 1.456245035881542, + "grad_norm": 9.339818000793457, + "learning_rate": 2.5729249401974303e-05, + "loss": 2.7918, + "step": 4684500 + }, + { + "epoch": 1.4564004681620288, + "grad_norm": 8.031414031982422, + "learning_rate": 2.5726658863966187e-05, + "loss": 2.7679, + "step": 4685000 + }, + { + "epoch": 1.4565559004425157, + "grad_norm": 7.305665493011475, + "learning_rate": 2.5724068325958074e-05, + "loss": 2.7973, + "step": 4685500 + }, + { + "epoch": 1.4567113327230026, + "grad_norm": 9.144301414489746, + "learning_rate": 2.572147778794996e-05, + "loss": 2.8246, + "step": 4686000 + }, + { + "epoch": 1.4568667650034894, + "grad_norm": 37.518089294433594, + "learning_rate": 2.571888724994184e-05, + "loss": 2.7991, + "step": 4686500 + }, + { + "epoch": 1.4570221972839763, + "grad_norm": 16.315595626831055, + "learning_rate": 2.571629671193373e-05, + "loss": 2.7831, + "step": 4687000 + }, + { + "epoch": 1.4571776295644632, + "grad_norm": 7.842794895172119, + "learning_rate": 2.5713706173925616e-05, + "loss": 2.7973, + "step": 4687500 + }, + { + "epoch": 1.45733306184495, + "grad_norm": 9.316739082336426, + "learning_rate": 2.57111156359175e-05, + "loss": 2.7307, + "step": 4688000 + }, + { + "epoch": 1.457488494125437, + "grad_norm": 9.383125305175781, + "learning_rate": 2.5708525097909387e-05, + "loss": 2.8152, + "step": 4688500 + }, + { + "epoch": 1.4576439264059238, + "grad_norm": 8.48526668548584, + "learning_rate": 2.5705934559901267e-05, + "loss": 2.7855, + "step": 4689000 + }, + { + "epoch": 1.4577993586864106, + "grad_norm": 9.191156387329102, + "learning_rate": 2.5703344021893154e-05, + "loss": 2.7909, + "step": 4689500 + }, + { + "epoch": 1.4579547909668975, + "grad_norm": 31.650190353393555, + "learning_rate": 2.570075348388504e-05, + "loss": 2.7599, + "step": 4690000 + }, + { + "epoch": 1.4581102232473846, + "grad_norm": 10.346294403076172, + "learning_rate": 2.5698162945876925e-05, + "loss": 2.7764, + "step": 4690500 + }, + { + "epoch": 1.4582656555278715, + "grad_norm": 9.272714614868164, + "learning_rate": 2.5695572407868812e-05, + "loss": 2.7911, + "step": 4691000 + }, + { + "epoch": 1.4584210878083583, + "grad_norm": 9.841176986694336, + "learning_rate": 2.56929818698607e-05, + "loss": 2.8228, + "step": 4691500 + }, + { + "epoch": 1.4585765200888452, + "grad_norm": 12.662188529968262, + "learning_rate": 2.569039133185258e-05, + "loss": 2.7701, + "step": 4692000 + }, + { + "epoch": 1.458731952369332, + "grad_norm": 33.65372848510742, + "learning_rate": 2.5687800793844467e-05, + "loss": 2.7894, + "step": 4692500 + }, + { + "epoch": 1.458887384649819, + "grad_norm": 9.501016616821289, + "learning_rate": 2.5685210255836358e-05, + "loss": 2.7835, + "step": 4693000 + }, + { + "epoch": 1.4590428169303058, + "grad_norm": 9.022509574890137, + "learning_rate": 2.5682619717828238e-05, + "loss": 2.7488, + "step": 4693500 + }, + { + "epoch": 1.4591982492107927, + "grad_norm": 12.298375129699707, + "learning_rate": 2.5680029179820125e-05, + "loss": 2.7576, + "step": 4694000 + }, + { + "epoch": 1.4593536814912795, + "grad_norm": 9.560957908630371, + "learning_rate": 2.5677438641812012e-05, + "loss": 2.8431, + "step": 4694500 + }, + { + "epoch": 1.4595091137717664, + "grad_norm": 17.73150062561035, + "learning_rate": 2.5674848103803896e-05, + "loss": 2.843, + "step": 4695000 + }, + { + "epoch": 1.4596645460522533, + "grad_norm": 11.096833229064941, + "learning_rate": 2.5672257565795783e-05, + "loss": 2.7571, + "step": 4695500 + }, + { + "epoch": 1.4598199783327401, + "grad_norm": 8.337261199951172, + "learning_rate": 2.5669667027787664e-05, + "loss": 2.7865, + "step": 4696000 + }, + { + "epoch": 1.459975410613227, + "grad_norm": 12.325234413146973, + "learning_rate": 2.566707648977955e-05, + "loss": 2.7677, + "step": 4696500 + }, + { + "epoch": 1.4601308428937139, + "grad_norm": 10.872173309326172, + "learning_rate": 2.5664485951771438e-05, + "loss": 2.7628, + "step": 4697000 + }, + { + "epoch": 1.4602862751742007, + "grad_norm": 9.762747764587402, + "learning_rate": 2.566189541376332e-05, + "loss": 2.7635, + "step": 4697500 + }, + { + "epoch": 1.4604417074546876, + "grad_norm": 7.942526340484619, + "learning_rate": 2.565930487575521e-05, + "loss": 2.7921, + "step": 4698000 + }, + { + "epoch": 1.4605971397351745, + "grad_norm": 13.483667373657227, + "learning_rate": 2.5656714337747096e-05, + "loss": 2.7916, + "step": 4698500 + }, + { + "epoch": 1.4607525720156613, + "grad_norm": 10.852267265319824, + "learning_rate": 2.5654123799738976e-05, + "loss": 2.8704, + "step": 4699000 + }, + { + "epoch": 1.4609080042961482, + "grad_norm": 9.789371490478516, + "learning_rate": 2.5651533261730863e-05, + "loss": 2.8277, + "step": 4699500 + }, + { + "epoch": 1.461063436576635, + "grad_norm": 8.29953384399414, + "learning_rate": 2.564894272372275e-05, + "loss": 2.8162, + "step": 4700000 + }, + { + "epoch": 1.461218868857122, + "grad_norm": 20.177671432495117, + "learning_rate": 2.5646352185714634e-05, + "loss": 2.8274, + "step": 4700500 + }, + { + "epoch": 1.4613743011376088, + "grad_norm": 18.20463752746582, + "learning_rate": 2.564376164770652e-05, + "loss": 2.7858, + "step": 4701000 + }, + { + "epoch": 1.4615297334180957, + "grad_norm": 5.7246317863464355, + "learning_rate": 2.5641171109698402e-05, + "loss": 2.7746, + "step": 4701500 + }, + { + "epoch": 1.4616851656985825, + "grad_norm": 8.481738090515137, + "learning_rate": 2.563858057169029e-05, + "loss": 2.7858, + "step": 4702000 + }, + { + "epoch": 1.4618405979790694, + "grad_norm": 8.69205093383789, + "learning_rate": 2.5635990033682176e-05, + "loss": 2.7969, + "step": 4702500 + }, + { + "epoch": 1.4619960302595563, + "grad_norm": 12.636767387390137, + "learning_rate": 2.563339949567406e-05, + "loss": 2.8171, + "step": 4703000 + }, + { + "epoch": 1.4621514625400431, + "grad_norm": 8.095431327819824, + "learning_rate": 2.5630808957665947e-05, + "loss": 2.787, + "step": 4703500 + }, + { + "epoch": 1.46230689482053, + "grad_norm": 7.818783760070801, + "learning_rate": 2.5628218419657834e-05, + "loss": 2.7815, + "step": 4704000 + }, + { + "epoch": 1.4624623271010169, + "grad_norm": 9.245081901550293, + "learning_rate": 2.5625627881649718e-05, + "loss": 2.7357, + "step": 4704500 + }, + { + "epoch": 1.4626177593815037, + "grad_norm": 8.832633018493652, + "learning_rate": 2.5623037343641605e-05, + "loss": 2.7971, + "step": 4705000 + }, + { + "epoch": 1.4627731916619908, + "grad_norm": 9.058510780334473, + "learning_rate": 2.5620446805633492e-05, + "loss": 2.7954, + "step": 4705500 + }, + { + "epoch": 1.4629286239424777, + "grad_norm": 7.750351905822754, + "learning_rate": 2.5617856267625373e-05, + "loss": 2.756, + "step": 4706000 + }, + { + "epoch": 1.4630840562229646, + "grad_norm": 12.721713066101074, + "learning_rate": 2.561526572961726e-05, + "loss": 2.7676, + "step": 4706500 + }, + { + "epoch": 1.4632394885034514, + "grad_norm": 9.793500900268555, + "learning_rate": 2.5612675191609144e-05, + "loss": 2.7893, + "step": 4707000 + }, + { + "epoch": 1.4633949207839383, + "grad_norm": 14.692646026611328, + "learning_rate": 2.561008465360103e-05, + "loss": 2.8067, + "step": 4707500 + }, + { + "epoch": 1.4635503530644252, + "grad_norm": 9.359505653381348, + "learning_rate": 2.5607494115592918e-05, + "loss": 2.7771, + "step": 4708000 + }, + { + "epoch": 1.463705785344912, + "grad_norm": 12.003331184387207, + "learning_rate": 2.56049035775848e-05, + "loss": 2.771, + "step": 4708500 + }, + { + "epoch": 1.463861217625399, + "grad_norm": 71.33354187011719, + "learning_rate": 2.5602313039576685e-05, + "loss": 2.8058, + "step": 4709000 + }, + { + "epoch": 1.4640166499058858, + "grad_norm": 8.798882484436035, + "learning_rate": 2.5599722501568573e-05, + "loss": 2.7923, + "step": 4709500 + }, + { + "epoch": 1.4641720821863726, + "grad_norm": 10.31511116027832, + "learning_rate": 2.5597131963560456e-05, + "loss": 2.804, + "step": 4710000 + }, + { + "epoch": 1.4643275144668595, + "grad_norm": 9.025091171264648, + "learning_rate": 2.5594541425552344e-05, + "loss": 2.7916, + "step": 4710500 + }, + { + "epoch": 1.4644829467473464, + "grad_norm": 9.792966842651367, + "learning_rate": 2.559195088754423e-05, + "loss": 2.752, + "step": 4711000 + }, + { + "epoch": 1.4646383790278332, + "grad_norm": 10.617185592651367, + "learning_rate": 2.558936034953611e-05, + "loss": 2.8056, + "step": 4711500 + }, + { + "epoch": 1.46479381130832, + "grad_norm": 27.28983497619629, + "learning_rate": 2.5586769811527998e-05, + "loss": 2.815, + "step": 4712000 + }, + { + "epoch": 1.464949243588807, + "grad_norm": 15.765388488769531, + "learning_rate": 2.5584179273519885e-05, + "loss": 2.7979, + "step": 4712500 + }, + { + "epoch": 1.4651046758692938, + "grad_norm": 11.943466186523438, + "learning_rate": 2.558158873551177e-05, + "loss": 2.7867, + "step": 4713000 + }, + { + "epoch": 1.4652601081497807, + "grad_norm": 7.0924553871154785, + "learning_rate": 2.5578998197503656e-05, + "loss": 2.7665, + "step": 4713500 + }, + { + "epoch": 1.4654155404302676, + "grad_norm": 7.798406600952148, + "learning_rate": 2.5576407659495537e-05, + "loss": 2.8375, + "step": 4714000 + }, + { + "epoch": 1.4655709727107546, + "grad_norm": 10.009363174438477, + "learning_rate": 2.5573817121487427e-05, + "loss": 2.7362, + "step": 4714500 + }, + { + "epoch": 1.4657264049912415, + "grad_norm": 12.048370361328125, + "learning_rate": 2.5571226583479314e-05, + "loss": 2.7552, + "step": 4715000 + }, + { + "epoch": 1.4658818372717284, + "grad_norm": 9.993474960327148, + "learning_rate": 2.5568636045471195e-05, + "loss": 2.8095, + "step": 4715500 + }, + { + "epoch": 1.4660372695522152, + "grad_norm": 20.247438430786133, + "learning_rate": 2.5566045507463082e-05, + "loss": 2.8388, + "step": 4716000 + }, + { + "epoch": 1.4661927018327021, + "grad_norm": 15.593304634094238, + "learning_rate": 2.556345496945497e-05, + "loss": 2.7493, + "step": 4716500 + }, + { + "epoch": 1.466348134113189, + "grad_norm": 11.748769760131836, + "learning_rate": 2.5560864431446853e-05, + "loss": 2.7851, + "step": 4717000 + }, + { + "epoch": 1.4665035663936759, + "grad_norm": 8.697288513183594, + "learning_rate": 2.555827389343874e-05, + "loss": 2.7705, + "step": 4717500 + }, + { + "epoch": 1.4666589986741627, + "grad_norm": 19.414857864379883, + "learning_rate": 2.5555683355430627e-05, + "loss": 2.7579, + "step": 4718000 + }, + { + "epoch": 1.4668144309546496, + "grad_norm": 8.28676700592041, + "learning_rate": 2.5553092817422508e-05, + "loss": 2.759, + "step": 4718500 + }, + { + "epoch": 1.4669698632351365, + "grad_norm": 8.914092063903809, + "learning_rate": 2.5550502279414395e-05, + "loss": 2.8036, + "step": 4719000 + }, + { + "epoch": 1.4671252955156233, + "grad_norm": 12.21248722076416, + "learning_rate": 2.554791174140628e-05, + "loss": 2.7804, + "step": 4719500 + }, + { + "epoch": 1.4672807277961102, + "grad_norm": 8.244056701660156, + "learning_rate": 2.5545321203398166e-05, + "loss": 2.7953, + "step": 4720000 + }, + { + "epoch": 1.467436160076597, + "grad_norm": 10.206940650939941, + "learning_rate": 2.5542730665390053e-05, + "loss": 2.787, + "step": 4720500 + }, + { + "epoch": 1.467591592357084, + "grad_norm": 11.30848217010498, + "learning_rate": 2.5540140127381933e-05, + "loss": 2.8028, + "step": 4721000 + }, + { + "epoch": 1.4677470246375708, + "grad_norm": 10.763299942016602, + "learning_rate": 2.553754958937382e-05, + "loss": 2.804, + "step": 4721500 + }, + { + "epoch": 1.4679024569180577, + "grad_norm": 9.418049812316895, + "learning_rate": 2.5534959051365707e-05, + "loss": 2.8421, + "step": 4722000 + }, + { + "epoch": 1.4680578891985445, + "grad_norm": 9.473061561584473, + "learning_rate": 2.553236851335759e-05, + "loss": 2.7715, + "step": 4722500 + }, + { + "epoch": 1.4682133214790314, + "grad_norm": 9.918183326721191, + "learning_rate": 2.552977797534948e-05, + "loss": 2.7795, + "step": 4723000 + }, + { + "epoch": 1.4683687537595183, + "grad_norm": 9.139067649841309, + "learning_rate": 2.5527187437341365e-05, + "loss": 2.7915, + "step": 4723500 + }, + { + "epoch": 1.4685241860400051, + "grad_norm": 11.25169849395752, + "learning_rate": 2.5524596899333246e-05, + "loss": 2.7747, + "step": 4724000 + }, + { + "epoch": 1.468679618320492, + "grad_norm": 12.359132766723633, + "learning_rate": 2.5522006361325136e-05, + "loss": 2.7774, + "step": 4724500 + }, + { + "epoch": 1.4688350506009789, + "grad_norm": 8.416899681091309, + "learning_rate": 2.5519415823317017e-05, + "loss": 2.7437, + "step": 4725000 + }, + { + "epoch": 1.4689904828814657, + "grad_norm": 8.506092071533203, + "learning_rate": 2.5516825285308904e-05, + "loss": 2.8093, + "step": 4725500 + }, + { + "epoch": 1.4691459151619526, + "grad_norm": 7.694880485534668, + "learning_rate": 2.551423474730079e-05, + "loss": 2.7874, + "step": 4726000 + }, + { + "epoch": 1.4693013474424395, + "grad_norm": 8.12295913696289, + "learning_rate": 2.5511644209292675e-05, + "loss": 2.8008, + "step": 4726500 + }, + { + "epoch": 1.4694567797229263, + "grad_norm": 9.651259422302246, + "learning_rate": 2.5509053671284562e-05, + "loss": 2.8146, + "step": 4727000 + }, + { + "epoch": 1.4696122120034132, + "grad_norm": 8.837837219238281, + "learning_rate": 2.550646313327645e-05, + "loss": 2.7437, + "step": 4727500 + }, + { + "epoch": 1.4697676442839, + "grad_norm": 10.666661262512207, + "learning_rate": 2.550387259526833e-05, + "loss": 2.7635, + "step": 4728000 + }, + { + "epoch": 1.469923076564387, + "grad_norm": 6.948192119598389, + "learning_rate": 2.5501282057260217e-05, + "loss": 2.7652, + "step": 4728500 + }, + { + "epoch": 1.4700785088448738, + "grad_norm": 9.273359298706055, + "learning_rate": 2.5498691519252104e-05, + "loss": 2.7316, + "step": 4729000 + }, + { + "epoch": 1.4702339411253609, + "grad_norm": 7.17586088180542, + "learning_rate": 2.5496100981243988e-05, + "loss": 2.803, + "step": 4729500 + }, + { + "epoch": 1.4703893734058477, + "grad_norm": 7.916642189025879, + "learning_rate": 2.5493510443235875e-05, + "loss": 2.7993, + "step": 4730000 + }, + { + "epoch": 1.4705448056863346, + "grad_norm": 13.771820068359375, + "learning_rate": 2.5490919905227762e-05, + "loss": 2.8164, + "step": 4730500 + }, + { + "epoch": 1.4707002379668215, + "grad_norm": 30.05401039123535, + "learning_rate": 2.5488329367219642e-05, + "loss": 2.734, + "step": 4731000 + }, + { + "epoch": 1.4708556702473083, + "grad_norm": 10.340119361877441, + "learning_rate": 2.548573882921153e-05, + "loss": 2.788, + "step": 4731500 + }, + { + "epoch": 1.4710111025277952, + "grad_norm": 8.538993835449219, + "learning_rate": 2.5483148291203413e-05, + "loss": 2.8036, + "step": 4732000 + }, + { + "epoch": 1.471166534808282, + "grad_norm": 8.922978401184082, + "learning_rate": 2.54805577531953e-05, + "loss": 2.8303, + "step": 4732500 + }, + { + "epoch": 1.471321967088769, + "grad_norm": 8.317039489746094, + "learning_rate": 2.5477967215187188e-05, + "loss": 2.7904, + "step": 4733000 + }, + { + "epoch": 1.4714773993692558, + "grad_norm": 8.932711601257324, + "learning_rate": 2.5475376677179068e-05, + "loss": 2.8318, + "step": 4733500 + }, + { + "epoch": 1.4716328316497427, + "grad_norm": 9.552082061767578, + "learning_rate": 2.5472786139170955e-05, + "loss": 2.7578, + "step": 4734000 + }, + { + "epoch": 1.4717882639302295, + "grad_norm": 11.84228515625, + "learning_rate": 2.5470195601162846e-05, + "loss": 2.7989, + "step": 4734500 + }, + { + "epoch": 1.4719436962107164, + "grad_norm": 9.521279335021973, + "learning_rate": 2.5467605063154726e-05, + "loss": 2.8152, + "step": 4735000 + }, + { + "epoch": 1.4720991284912033, + "grad_norm": 27.38538932800293, + "learning_rate": 2.5465014525146613e-05, + "loss": 2.8371, + "step": 4735500 + }, + { + "epoch": 1.4722545607716901, + "grad_norm": 9.634171485900879, + "learning_rate": 2.54624239871385e-05, + "loss": 2.8503, + "step": 4736000 + }, + { + "epoch": 1.472409993052177, + "grad_norm": 9.627704620361328, + "learning_rate": 2.5459833449130384e-05, + "loss": 2.8005, + "step": 4736500 + }, + { + "epoch": 1.4725654253326639, + "grad_norm": 9.395179748535156, + "learning_rate": 2.545724291112227e-05, + "loss": 2.7373, + "step": 4737000 + }, + { + "epoch": 1.4727208576131507, + "grad_norm": 11.9993896484375, + "learning_rate": 2.545465237311415e-05, + "loss": 2.7389, + "step": 4737500 + }, + { + "epoch": 1.4728762898936376, + "grad_norm": 10.177509307861328, + "learning_rate": 2.545206183510604e-05, + "loss": 2.7801, + "step": 4738000 + }, + { + "epoch": 1.4730317221741247, + "grad_norm": 37.55833435058594, + "learning_rate": 2.5449471297097926e-05, + "loss": 2.8146, + "step": 4738500 + }, + { + "epoch": 1.4731871544546116, + "grad_norm": 9.72758960723877, + "learning_rate": 2.544688075908981e-05, + "loss": 2.7744, + "step": 4739000 + }, + { + "epoch": 1.4733425867350984, + "grad_norm": 10.852993965148926, + "learning_rate": 2.5444290221081697e-05, + "loss": 2.7879, + "step": 4739500 + }, + { + "epoch": 1.4734980190155853, + "grad_norm": 8.163925170898438, + "learning_rate": 2.5441699683073584e-05, + "loss": 2.7908, + "step": 4740000 + }, + { + "epoch": 1.4736534512960722, + "grad_norm": 7.669185638427734, + "learning_rate": 2.5439109145065464e-05, + "loss": 2.7344, + "step": 4740500 + }, + { + "epoch": 1.473808883576559, + "grad_norm": 9.283528327941895, + "learning_rate": 2.543651860705735e-05, + "loss": 2.7715, + "step": 4741000 + }, + { + "epoch": 1.473964315857046, + "grad_norm": 26.63079261779785, + "learning_rate": 2.543392806904924e-05, + "loss": 2.7881, + "step": 4741500 + }, + { + "epoch": 1.4741197481375328, + "grad_norm": 8.915225982666016, + "learning_rate": 2.5431337531041122e-05, + "loss": 2.7937, + "step": 4742000 + }, + { + "epoch": 1.4742751804180196, + "grad_norm": 13.517271041870117, + "learning_rate": 2.542874699303301e-05, + "loss": 2.8047, + "step": 4742500 + }, + { + "epoch": 1.4744306126985065, + "grad_norm": 12.171009063720703, + "learning_rate": 2.542615645502489e-05, + "loss": 2.7942, + "step": 4743000 + }, + { + "epoch": 1.4745860449789934, + "grad_norm": 8.865447998046875, + "learning_rate": 2.5423565917016777e-05, + "loss": 2.7996, + "step": 4743500 + }, + { + "epoch": 1.4747414772594802, + "grad_norm": 8.357906341552734, + "learning_rate": 2.5420975379008664e-05, + "loss": 2.7685, + "step": 4744000 + }, + { + "epoch": 1.474896909539967, + "grad_norm": 10.073769569396973, + "learning_rate": 2.5418384841000548e-05, + "loss": 2.7767, + "step": 4744500 + }, + { + "epoch": 1.475052341820454, + "grad_norm": 17.509506225585938, + "learning_rate": 2.5415794302992435e-05, + "loss": 2.7555, + "step": 4745000 + }, + { + "epoch": 1.4752077741009408, + "grad_norm": 7.916831970214844, + "learning_rate": 2.5413203764984322e-05, + "loss": 2.7626, + "step": 4745500 + }, + { + "epoch": 1.4753632063814277, + "grad_norm": 12.238476753234863, + "learning_rate": 2.5410613226976203e-05, + "loss": 2.7626, + "step": 4746000 + }, + { + "epoch": 1.4755186386619146, + "grad_norm": 10.236080169677734, + "learning_rate": 2.5408022688968093e-05, + "loss": 2.7589, + "step": 4746500 + }, + { + "epoch": 1.4756740709424014, + "grad_norm": 19.46315574645996, + "learning_rate": 2.540543215095998e-05, + "loss": 2.7742, + "step": 4747000 + }, + { + "epoch": 1.4758295032228883, + "grad_norm": 8.273650169372559, + "learning_rate": 2.540284161295186e-05, + "loss": 2.819, + "step": 4747500 + }, + { + "epoch": 1.4759849355033752, + "grad_norm": 12.42080020904541, + "learning_rate": 2.5400251074943748e-05, + "loss": 2.8082, + "step": 4748000 + }, + { + "epoch": 1.476140367783862, + "grad_norm": 9.542378425598145, + "learning_rate": 2.5397660536935635e-05, + "loss": 2.7895, + "step": 4748500 + }, + { + "epoch": 1.476295800064349, + "grad_norm": 10.23065185546875, + "learning_rate": 2.539506999892752e-05, + "loss": 2.8116, + "step": 4749000 + }, + { + "epoch": 1.4764512323448358, + "grad_norm": 17.127731323242188, + "learning_rate": 2.5392479460919406e-05, + "loss": 2.7827, + "step": 4749500 + }, + { + "epoch": 1.4766066646253226, + "grad_norm": 9.382108688354492, + "learning_rate": 2.5389888922911286e-05, + "loss": 2.786, + "step": 4750000 + }, + { + "epoch": 1.4767620969058095, + "grad_norm": 7.362612724304199, + "learning_rate": 2.5387298384903174e-05, + "loss": 2.82, + "step": 4750500 + }, + { + "epoch": 1.4769175291862964, + "grad_norm": 9.058055877685547, + "learning_rate": 2.538470784689506e-05, + "loss": 2.7811, + "step": 4751000 + }, + { + "epoch": 1.4770729614667832, + "grad_norm": 8.013850212097168, + "learning_rate": 2.5382117308886944e-05, + "loss": 2.7995, + "step": 4751500 + }, + { + "epoch": 1.47722839374727, + "grad_norm": 9.503411293029785, + "learning_rate": 2.537952677087883e-05, + "loss": 2.7559, + "step": 4752000 + }, + { + "epoch": 1.477383826027757, + "grad_norm": 8.905311584472656, + "learning_rate": 2.537693623287072e-05, + "loss": 2.7824, + "step": 4752500 + }, + { + "epoch": 1.4775392583082438, + "grad_norm": 9.91280746459961, + "learning_rate": 2.53743456948626e-05, + "loss": 2.7705, + "step": 4753000 + }, + { + "epoch": 1.477694690588731, + "grad_norm": 9.825065612792969, + "learning_rate": 2.5371755156854486e-05, + "loss": 2.8, + "step": 4753500 + }, + { + "epoch": 1.4778501228692178, + "grad_norm": 7.652711391448975, + "learning_rate": 2.5369164618846373e-05, + "loss": 2.8069, + "step": 4754000 + }, + { + "epoch": 1.4780055551497047, + "grad_norm": 19.141765594482422, + "learning_rate": 2.5366574080838257e-05, + "loss": 2.7661, + "step": 4754500 + }, + { + "epoch": 1.4781609874301915, + "grad_norm": 9.519120216369629, + "learning_rate": 2.5363983542830144e-05, + "loss": 2.7886, + "step": 4755000 + }, + { + "epoch": 1.4783164197106784, + "grad_norm": 8.168241500854492, + "learning_rate": 2.5361393004822025e-05, + "loss": 2.7708, + "step": 4755500 + }, + { + "epoch": 1.4784718519911653, + "grad_norm": 8.335256576538086, + "learning_rate": 2.5358802466813912e-05, + "loss": 2.8456, + "step": 4756000 + }, + { + "epoch": 1.4786272842716521, + "grad_norm": 9.390768051147461, + "learning_rate": 2.5356211928805802e-05, + "loss": 2.7817, + "step": 4756500 + }, + { + "epoch": 1.478782716552139, + "grad_norm": 10.080512046813965, + "learning_rate": 2.5353621390797683e-05, + "loss": 2.8259, + "step": 4757000 + }, + { + "epoch": 1.4789381488326259, + "grad_norm": 8.324600219726562, + "learning_rate": 2.535103085278957e-05, + "loss": 2.7397, + "step": 4757500 + }, + { + "epoch": 1.4790935811131127, + "grad_norm": 8.314863204956055, + "learning_rate": 2.5348440314781457e-05, + "loss": 2.7685, + "step": 4758000 + }, + { + "epoch": 1.4792490133935996, + "grad_norm": 5.986328125, + "learning_rate": 2.534584977677334e-05, + "loss": 2.766, + "step": 4758500 + }, + { + "epoch": 1.4794044456740865, + "grad_norm": 8.377809524536133, + "learning_rate": 2.5343259238765228e-05, + "loss": 2.788, + "step": 4759000 + }, + { + "epoch": 1.4795598779545733, + "grad_norm": 10.35903549194336, + "learning_rate": 2.5340668700757115e-05, + "loss": 2.7635, + "step": 4759500 + }, + { + "epoch": 1.4797153102350602, + "grad_norm": 8.349854469299316, + "learning_rate": 2.5338078162748996e-05, + "loss": 2.7881, + "step": 4760000 + }, + { + "epoch": 1.479870742515547, + "grad_norm": 8.617098808288574, + "learning_rate": 2.5335487624740883e-05, + "loss": 2.7912, + "step": 4760500 + }, + { + "epoch": 1.480026174796034, + "grad_norm": 10.881156921386719, + "learning_rate": 2.5332897086732766e-05, + "loss": 2.81, + "step": 4761000 + }, + { + "epoch": 1.4801816070765208, + "grad_norm": 7.760189056396484, + "learning_rate": 2.5330306548724654e-05, + "loss": 2.7386, + "step": 4761500 + }, + { + "epoch": 1.4803370393570077, + "grad_norm": 8.639976501464844, + "learning_rate": 2.532771601071654e-05, + "loss": 2.7589, + "step": 4762000 + }, + { + "epoch": 1.4804924716374948, + "grad_norm": 10.33633804321289, + "learning_rate": 2.532512547270842e-05, + "loss": 2.782, + "step": 4762500 + }, + { + "epoch": 1.4806479039179816, + "grad_norm": 9.167875289916992, + "learning_rate": 2.532253493470031e-05, + "loss": 2.7804, + "step": 4763000 + }, + { + "epoch": 1.4808033361984685, + "grad_norm": 9.002975463867188, + "learning_rate": 2.5319944396692195e-05, + "loss": 2.8076, + "step": 4763500 + }, + { + "epoch": 1.4809587684789554, + "grad_norm": 15.100875854492188, + "learning_rate": 2.531735385868408e-05, + "loss": 2.7858, + "step": 4764000 + }, + { + "epoch": 1.4811142007594422, + "grad_norm": 11.758209228515625, + "learning_rate": 2.5314763320675966e-05, + "loss": 2.7867, + "step": 4764500 + }, + { + "epoch": 1.481269633039929, + "grad_norm": 9.279455184936523, + "learning_rate": 2.5312172782667854e-05, + "loss": 2.802, + "step": 4765000 + }, + { + "epoch": 1.481425065320416, + "grad_norm": 7.2642011642456055, + "learning_rate": 2.5309582244659734e-05, + "loss": 2.787, + "step": 4765500 + }, + { + "epoch": 1.4815804976009028, + "grad_norm": 9.065606117248535, + "learning_rate": 2.530699170665162e-05, + "loss": 2.7683, + "step": 4766000 + }, + { + "epoch": 1.4817359298813897, + "grad_norm": 10.984664916992188, + "learning_rate": 2.530440116864351e-05, + "loss": 2.8193, + "step": 4766500 + }, + { + "epoch": 1.4818913621618766, + "grad_norm": 13.436629295349121, + "learning_rate": 2.5301810630635392e-05, + "loss": 2.795, + "step": 4767000 + }, + { + "epoch": 1.4820467944423634, + "grad_norm": 9.748923301696777, + "learning_rate": 2.529922009262728e-05, + "loss": 2.7804, + "step": 4767500 + }, + { + "epoch": 1.4822022267228503, + "grad_norm": 9.401044845581055, + "learning_rate": 2.529662955461916e-05, + "loss": 2.7983, + "step": 4768000 + }, + { + "epoch": 1.4823576590033372, + "grad_norm": 9.407771110534668, + "learning_rate": 2.529403901661105e-05, + "loss": 2.7916, + "step": 4768500 + }, + { + "epoch": 1.482513091283824, + "grad_norm": 10.571331977844238, + "learning_rate": 2.5291448478602937e-05, + "loss": 2.8489, + "step": 4769000 + }, + { + "epoch": 1.482668523564311, + "grad_norm": 9.397067070007324, + "learning_rate": 2.5288857940594818e-05, + "loss": 2.7693, + "step": 4769500 + }, + { + "epoch": 1.4828239558447978, + "grad_norm": 10.228026390075684, + "learning_rate": 2.5286267402586705e-05, + "loss": 2.8248, + "step": 4770000 + }, + { + "epoch": 1.4829793881252846, + "grad_norm": 14.382258415222168, + "learning_rate": 2.5283676864578592e-05, + "loss": 2.7809, + "step": 4770500 + }, + { + "epoch": 1.4831348204057715, + "grad_norm": 11.103045463562012, + "learning_rate": 2.5281086326570476e-05, + "loss": 2.7458, + "step": 4771000 + }, + { + "epoch": 1.4832902526862584, + "grad_norm": 8.96082592010498, + "learning_rate": 2.5278495788562363e-05, + "loss": 2.7601, + "step": 4771500 + }, + { + "epoch": 1.4834456849667452, + "grad_norm": 25.81154441833496, + "learning_rate": 2.527590525055425e-05, + "loss": 2.8057, + "step": 4772000 + }, + { + "epoch": 1.483601117247232, + "grad_norm": 29.56778335571289, + "learning_rate": 2.527331471254613e-05, + "loss": 2.7639, + "step": 4772500 + }, + { + "epoch": 1.483756549527719, + "grad_norm": 11.653508186340332, + "learning_rate": 2.5270724174538017e-05, + "loss": 2.7519, + "step": 4773000 + }, + { + "epoch": 1.4839119818082058, + "grad_norm": 13.539255142211914, + "learning_rate": 2.52681336365299e-05, + "loss": 2.7494, + "step": 4773500 + }, + { + "epoch": 1.4840674140886927, + "grad_norm": 8.648114204406738, + "learning_rate": 2.526554309852179e-05, + "loss": 2.7802, + "step": 4774000 + }, + { + "epoch": 1.4842228463691796, + "grad_norm": 9.478727340698242, + "learning_rate": 2.5262952560513676e-05, + "loss": 2.7883, + "step": 4774500 + }, + { + "epoch": 1.4843782786496664, + "grad_norm": 7.8830108642578125, + "learning_rate": 2.5260362022505556e-05, + "loss": 2.7637, + "step": 4775000 + }, + { + "epoch": 1.4845337109301533, + "grad_norm": 7.0982441902160645, + "learning_rate": 2.5257771484497443e-05, + "loss": 2.8, + "step": 4775500 + }, + { + "epoch": 1.4846891432106402, + "grad_norm": 8.943832397460938, + "learning_rate": 2.525518094648933e-05, + "loss": 2.7863, + "step": 4776000 + }, + { + "epoch": 1.484844575491127, + "grad_norm": 7.58902645111084, + "learning_rate": 2.5252590408481214e-05, + "loss": 2.8054, + "step": 4776500 + }, + { + "epoch": 1.485000007771614, + "grad_norm": 8.032527923583984, + "learning_rate": 2.52499998704731e-05, + "loss": 2.7646, + "step": 4777000 + }, + { + "epoch": 1.485155440052101, + "grad_norm": 8.963438987731934, + "learning_rate": 2.524740933246499e-05, + "loss": 2.7752, + "step": 4777500 + }, + { + "epoch": 1.4853108723325879, + "grad_norm": 8.775457382202148, + "learning_rate": 2.524481879445687e-05, + "loss": 2.7704, + "step": 4778000 + }, + { + "epoch": 1.4854663046130747, + "grad_norm": 7.664243698120117, + "learning_rate": 2.524222825644876e-05, + "loss": 2.814, + "step": 4778500 + }, + { + "epoch": 1.4856217368935616, + "grad_norm": 9.11829948425293, + "learning_rate": 2.523963771844064e-05, + "loss": 2.7755, + "step": 4779000 + }, + { + "epoch": 1.4857771691740485, + "grad_norm": 9.005972862243652, + "learning_rate": 2.5237047180432527e-05, + "loss": 2.7612, + "step": 4779500 + }, + { + "epoch": 1.4859326014545353, + "grad_norm": 9.911689758300781, + "learning_rate": 2.5234456642424414e-05, + "loss": 2.7479, + "step": 4780000 + }, + { + "epoch": 1.4860880337350222, + "grad_norm": 9.002018928527832, + "learning_rate": 2.5231866104416298e-05, + "loss": 2.8396, + "step": 4780500 + }, + { + "epoch": 1.486243466015509, + "grad_norm": 12.56339168548584, + "learning_rate": 2.5229275566408185e-05, + "loss": 2.7866, + "step": 4781000 + }, + { + "epoch": 1.486398898295996, + "grad_norm": 8.476409912109375, + "learning_rate": 2.5226685028400072e-05, + "loss": 2.7977, + "step": 4781500 + }, + { + "epoch": 1.4865543305764828, + "grad_norm": 16.670534133911133, + "learning_rate": 2.5224094490391952e-05, + "loss": 2.7518, + "step": 4782000 + }, + { + "epoch": 1.4867097628569697, + "grad_norm": 9.725967407226562, + "learning_rate": 2.522150395238384e-05, + "loss": 2.7611, + "step": 4782500 + }, + { + "epoch": 1.4868651951374565, + "grad_norm": 8.284482955932617, + "learning_rate": 2.5218913414375727e-05, + "loss": 2.7434, + "step": 4783000 + }, + { + "epoch": 1.4870206274179434, + "grad_norm": 7.308860778808594, + "learning_rate": 2.521632287636761e-05, + "loss": 2.839, + "step": 4783500 + }, + { + "epoch": 1.4871760596984303, + "grad_norm": 9.190062522888184, + "learning_rate": 2.5213732338359498e-05, + "loss": 2.7761, + "step": 4784000 + }, + { + "epoch": 1.4873314919789171, + "grad_norm": 7.921609878540039, + "learning_rate": 2.5211141800351385e-05, + "loss": 2.752, + "step": 4784500 + }, + { + "epoch": 1.487486924259404, + "grad_norm": 9.744239807128906, + "learning_rate": 2.5208551262343265e-05, + "loss": 2.777, + "step": 4785000 + }, + { + "epoch": 1.4876423565398909, + "grad_norm": 21.314043045043945, + "learning_rate": 2.5205960724335152e-05, + "loss": 2.7601, + "step": 4785500 + }, + { + "epoch": 1.4877977888203777, + "grad_norm": 10.74771499633789, + "learning_rate": 2.5203370186327036e-05, + "loss": 2.7772, + "step": 4786000 + }, + { + "epoch": 1.4879532211008646, + "grad_norm": 9.366706848144531, + "learning_rate": 2.5200779648318923e-05, + "loss": 2.7862, + "step": 4786500 + }, + { + "epoch": 1.4881086533813517, + "grad_norm": 11.968894958496094, + "learning_rate": 2.519818911031081e-05, + "loss": 2.7684, + "step": 4787000 + }, + { + "epoch": 1.4882640856618385, + "grad_norm": 16.41863250732422, + "learning_rate": 2.519559857230269e-05, + "loss": 2.7695, + "step": 4787500 + }, + { + "epoch": 1.4884195179423254, + "grad_norm": 8.332488059997559, + "learning_rate": 2.5193008034294578e-05, + "loss": 2.7546, + "step": 4788000 + }, + { + "epoch": 1.4885749502228123, + "grad_norm": 11.112042427062988, + "learning_rate": 2.519041749628647e-05, + "loss": 2.8076, + "step": 4788500 + }, + { + "epoch": 1.4887303825032991, + "grad_norm": 12.179306030273438, + "learning_rate": 2.518782695827835e-05, + "loss": 2.7996, + "step": 4789000 + }, + { + "epoch": 1.488885814783786, + "grad_norm": 8.776727676391602, + "learning_rate": 2.5185236420270236e-05, + "loss": 2.8126, + "step": 4789500 + }, + { + "epoch": 1.4890412470642729, + "grad_norm": 8.717992782592773, + "learning_rate": 2.5182645882262123e-05, + "loss": 2.7898, + "step": 4790000 + }, + { + "epoch": 1.4891966793447597, + "grad_norm": 8.054593086242676, + "learning_rate": 2.5180055344254007e-05, + "loss": 2.7817, + "step": 4790500 + }, + { + "epoch": 1.4893521116252466, + "grad_norm": 9.26719856262207, + "learning_rate": 2.5177464806245894e-05, + "loss": 2.7606, + "step": 4791000 + }, + { + "epoch": 1.4895075439057335, + "grad_norm": 11.862593650817871, + "learning_rate": 2.5174874268237774e-05, + "loss": 2.8321, + "step": 4791500 + }, + { + "epoch": 1.4896629761862203, + "grad_norm": 9.473904609680176, + "learning_rate": 2.517228373022966e-05, + "loss": 2.741, + "step": 4792000 + }, + { + "epoch": 1.4898184084667072, + "grad_norm": 30.039287567138672, + "learning_rate": 2.516969319222155e-05, + "loss": 2.7353, + "step": 4792500 + }, + { + "epoch": 1.489973840747194, + "grad_norm": 9.40739631652832, + "learning_rate": 2.5167102654213432e-05, + "loss": 2.7924, + "step": 4793000 + }, + { + "epoch": 1.490129273027681, + "grad_norm": 11.058066368103027, + "learning_rate": 2.516451211620532e-05, + "loss": 2.7673, + "step": 4793500 + }, + { + "epoch": 1.4902847053081678, + "grad_norm": 11.487042427062988, + "learning_rate": 2.5161921578197207e-05, + "loss": 2.8005, + "step": 4794000 + }, + { + "epoch": 1.4904401375886547, + "grad_norm": 8.755714416503906, + "learning_rate": 2.5159331040189087e-05, + "loss": 2.8313, + "step": 4794500 + }, + { + "epoch": 1.4905955698691415, + "grad_norm": 9.348710060119629, + "learning_rate": 2.5156740502180974e-05, + "loss": 2.8004, + "step": 4795000 + }, + { + "epoch": 1.4907510021496284, + "grad_norm": 12.1837739944458, + "learning_rate": 2.515414996417286e-05, + "loss": 2.8031, + "step": 4795500 + }, + { + "epoch": 1.4909064344301153, + "grad_norm": 7.837753772735596, + "learning_rate": 2.5151559426164745e-05, + "loss": 2.7502, + "step": 4796000 + }, + { + "epoch": 1.4910618667106021, + "grad_norm": 8.107683181762695, + "learning_rate": 2.5148968888156632e-05, + "loss": 2.7656, + "step": 4796500 + }, + { + "epoch": 1.491217298991089, + "grad_norm": 9.515279769897461, + "learning_rate": 2.514637835014852e-05, + "loss": 2.7715, + "step": 4797000 + }, + { + "epoch": 1.4913727312715759, + "grad_norm": 14.494778633117676, + "learning_rate": 2.51437878121404e-05, + "loss": 2.8009, + "step": 4797500 + }, + { + "epoch": 1.4915281635520627, + "grad_norm": 8.613555908203125, + "learning_rate": 2.5141197274132287e-05, + "loss": 2.7809, + "step": 4798000 + }, + { + "epoch": 1.4916835958325496, + "grad_norm": 10.439467430114746, + "learning_rate": 2.513860673612417e-05, + "loss": 2.8125, + "step": 4798500 + }, + { + "epoch": 1.4918390281130365, + "grad_norm": 8.867156982421875, + "learning_rate": 2.5136016198116058e-05, + "loss": 2.7837, + "step": 4799000 + }, + { + "epoch": 1.4919944603935233, + "grad_norm": 7.878233432769775, + "learning_rate": 2.5133425660107945e-05, + "loss": 2.7826, + "step": 4799500 + }, + { + "epoch": 1.4921498926740102, + "grad_norm": 7.486385822296143, + "learning_rate": 2.513083512209983e-05, + "loss": 2.757, + "step": 4800000 + }, + { + "epoch": 1.492305324954497, + "grad_norm": 17.576351165771484, + "learning_rate": 2.5128244584091716e-05, + "loss": 2.7984, + "step": 4800500 + }, + { + "epoch": 1.492460757234984, + "grad_norm": 14.875651359558105, + "learning_rate": 2.5125654046083603e-05, + "loss": 2.774, + "step": 4801000 + }, + { + "epoch": 1.4926161895154708, + "grad_norm": 7.574766159057617, + "learning_rate": 2.5123063508075484e-05, + "loss": 2.7921, + "step": 4801500 + }, + { + "epoch": 1.492771621795958, + "grad_norm": 9.175198554992676, + "learning_rate": 2.512047297006737e-05, + "loss": 2.8366, + "step": 4802000 + }, + { + "epoch": 1.4929270540764448, + "grad_norm": 9.723220825195312, + "learning_rate": 2.5117882432059258e-05, + "loss": 2.8276, + "step": 4802500 + }, + { + "epoch": 1.4930824863569316, + "grad_norm": 22.55112075805664, + "learning_rate": 2.511529189405114e-05, + "loss": 2.7168, + "step": 4803000 + }, + { + "epoch": 1.4932379186374185, + "grad_norm": 9.236189842224121, + "learning_rate": 2.511270135604303e-05, + "loss": 2.7631, + "step": 4803500 + }, + { + "epoch": 1.4933933509179054, + "grad_norm": 32.85920715332031, + "learning_rate": 2.511011081803491e-05, + "loss": 2.8501, + "step": 4804000 + }, + { + "epoch": 1.4935487831983922, + "grad_norm": 11.794550895690918, + "learning_rate": 2.5107520280026796e-05, + "loss": 2.7521, + "step": 4804500 + }, + { + "epoch": 1.493704215478879, + "grad_norm": 55.9468879699707, + "learning_rate": 2.5104929742018684e-05, + "loss": 2.7836, + "step": 4805000 + }, + { + "epoch": 1.493859647759366, + "grad_norm": 9.583039283752441, + "learning_rate": 2.5102339204010567e-05, + "loss": 2.8278, + "step": 4805500 + }, + { + "epoch": 1.4940150800398528, + "grad_norm": 8.877067565917969, + "learning_rate": 2.5099748666002454e-05, + "loss": 2.8025, + "step": 4806000 + }, + { + "epoch": 1.4941705123203397, + "grad_norm": 9.837348937988281, + "learning_rate": 2.509715812799434e-05, + "loss": 2.8548, + "step": 4806500 + }, + { + "epoch": 1.4943259446008266, + "grad_norm": 21.816747665405273, + "learning_rate": 2.5094567589986222e-05, + "loss": 2.8167, + "step": 4807000 + }, + { + "epoch": 1.4944813768813134, + "grad_norm": 9.796653747558594, + "learning_rate": 2.509197705197811e-05, + "loss": 2.753, + "step": 4807500 + }, + { + "epoch": 1.4946368091618003, + "grad_norm": 25.594959259033203, + "learning_rate": 2.5089386513969996e-05, + "loss": 2.7512, + "step": 4808000 + }, + { + "epoch": 1.4947922414422872, + "grad_norm": 9.20024299621582, + "learning_rate": 2.508679597596188e-05, + "loss": 2.7908, + "step": 4808500 + }, + { + "epoch": 1.494947673722774, + "grad_norm": 11.175650596618652, + "learning_rate": 2.5084205437953767e-05, + "loss": 2.7655, + "step": 4809000 + }, + { + "epoch": 1.495103106003261, + "grad_norm": 10.380213737487793, + "learning_rate": 2.5081614899945648e-05, + "loss": 2.7636, + "step": 4809500 + }, + { + "epoch": 1.4952585382837478, + "grad_norm": 10.73068904876709, + "learning_rate": 2.5079024361937538e-05, + "loss": 2.8066, + "step": 4810000 + }, + { + "epoch": 1.4954139705642346, + "grad_norm": 10.12824535369873, + "learning_rate": 2.5076433823929425e-05, + "loss": 2.7891, + "step": 4810500 + }, + { + "epoch": 1.4955694028447217, + "grad_norm": 8.24197006225586, + "learning_rate": 2.5073843285921306e-05, + "loss": 2.7949, + "step": 4811000 + }, + { + "epoch": 1.4957248351252086, + "grad_norm": 8.632169723510742, + "learning_rate": 2.5071252747913193e-05, + "loss": 2.7717, + "step": 4811500 + }, + { + "epoch": 1.4958802674056955, + "grad_norm": 8.348494529724121, + "learning_rate": 2.506866220990508e-05, + "loss": 2.7848, + "step": 4812000 + }, + { + "epoch": 1.4960356996861823, + "grad_norm": 9.835138320922852, + "learning_rate": 2.5066071671896964e-05, + "loss": 2.7292, + "step": 4812500 + }, + { + "epoch": 1.4961911319666692, + "grad_norm": 8.355951309204102, + "learning_rate": 2.506348113388885e-05, + "loss": 2.8097, + "step": 4813000 + }, + { + "epoch": 1.496346564247156, + "grad_norm": 10.910604476928711, + "learning_rate": 2.5060890595880738e-05, + "loss": 2.8337, + "step": 4813500 + }, + { + "epoch": 1.496501996527643, + "grad_norm": 8.750630378723145, + "learning_rate": 2.505830005787262e-05, + "loss": 2.7648, + "step": 4814000 + }, + { + "epoch": 1.4966574288081298, + "grad_norm": 8.150300979614258, + "learning_rate": 2.5055709519864506e-05, + "loss": 2.778, + "step": 4814500 + }, + { + "epoch": 1.4968128610886167, + "grad_norm": 8.121079444885254, + "learning_rate": 2.5053118981856393e-05, + "loss": 2.7704, + "step": 4815000 + }, + { + "epoch": 1.4969682933691035, + "grad_norm": 469.6250915527344, + "learning_rate": 2.5050528443848276e-05, + "loss": 2.8004, + "step": 4815500 + }, + { + "epoch": 1.4971237256495904, + "grad_norm": 8.567000389099121, + "learning_rate": 2.5047937905840164e-05, + "loss": 2.8047, + "step": 4816000 + }, + { + "epoch": 1.4972791579300773, + "grad_norm": 7.608259677886963, + "learning_rate": 2.5045347367832044e-05, + "loss": 2.7831, + "step": 4816500 + }, + { + "epoch": 1.4974345902105641, + "grad_norm": 9.938404083251953, + "learning_rate": 2.504275682982393e-05, + "loss": 2.7891, + "step": 4817000 + }, + { + "epoch": 1.497590022491051, + "grad_norm": 8.952054977416992, + "learning_rate": 2.5040166291815818e-05, + "loss": 2.7854, + "step": 4817500 + }, + { + "epoch": 1.4977454547715379, + "grad_norm": 9.047266006469727, + "learning_rate": 2.5037575753807702e-05, + "loss": 2.7478, + "step": 4818000 + }, + { + "epoch": 1.4979008870520247, + "grad_norm": 9.405735969543457, + "learning_rate": 2.503498521579959e-05, + "loss": 2.8056, + "step": 4818500 + }, + { + "epoch": 1.4980563193325116, + "grad_norm": 10.873817443847656, + "learning_rate": 2.5032394677791476e-05, + "loss": 2.7504, + "step": 4819000 + }, + { + "epoch": 1.4982117516129985, + "grad_norm": 11.515939712524414, + "learning_rate": 2.5029804139783357e-05, + "loss": 2.786, + "step": 4819500 + }, + { + "epoch": 1.4983671838934853, + "grad_norm": 8.423455238342285, + "learning_rate": 2.5027213601775247e-05, + "loss": 2.7525, + "step": 4820000 + }, + { + "epoch": 1.4985226161739722, + "grad_norm": 10.858292579650879, + "learning_rate": 2.5024623063767134e-05, + "loss": 2.7419, + "step": 4820500 + }, + { + "epoch": 1.498678048454459, + "grad_norm": 11.320157051086426, + "learning_rate": 2.5022032525759015e-05, + "loss": 2.7997, + "step": 4821000 + }, + { + "epoch": 1.498833480734946, + "grad_norm": 10.674710273742676, + "learning_rate": 2.5019441987750902e-05, + "loss": 2.7335, + "step": 4821500 + }, + { + "epoch": 1.4989889130154328, + "grad_norm": 8.422795295715332, + "learning_rate": 2.5016851449742786e-05, + "loss": 2.815, + "step": 4822000 + }, + { + "epoch": 1.4991443452959197, + "grad_norm": 10.65086555480957, + "learning_rate": 2.5014260911734673e-05, + "loss": 2.8129, + "step": 4822500 + }, + { + "epoch": 1.4992997775764065, + "grad_norm": 8.413203239440918, + "learning_rate": 2.501167037372656e-05, + "loss": 2.8084, + "step": 4823000 + }, + { + "epoch": 1.4994552098568934, + "grad_norm": 6.152715682983398, + "learning_rate": 2.500907983571844e-05, + "loss": 2.7864, + "step": 4823500 + }, + { + "epoch": 1.4996106421373803, + "grad_norm": 14.90713882446289, + "learning_rate": 2.5006489297710328e-05, + "loss": 2.7674, + "step": 4824000 + }, + { + "epoch": 1.4997660744178671, + "grad_norm": 9.288939476013184, + "learning_rate": 2.5003898759702215e-05, + "loss": 2.792, + "step": 4824500 + }, + { + "epoch": 1.499921506698354, + "grad_norm": 10.952105522155762, + "learning_rate": 2.50013082216941e-05, + "loss": 2.7463, + "step": 4825000 + }, + { + "epoch": 1.5000769389788409, + "grad_norm": 11.038841247558594, + "learning_rate": 2.4998717683685986e-05, + "loss": 2.8166, + "step": 4825500 + }, + { + "epoch": 1.5002323712593277, + "grad_norm": 8.961341857910156, + "learning_rate": 2.499612714567787e-05, + "loss": 2.7814, + "step": 4826000 + }, + { + "epoch": 1.5003878035398146, + "grad_norm": 10.21471118927002, + "learning_rate": 2.4993536607669753e-05, + "loss": 2.7736, + "step": 4826500 + }, + { + "epoch": 1.5005432358203015, + "grad_norm": 11.546725273132324, + "learning_rate": 2.499094606966164e-05, + "loss": 2.7956, + "step": 4827000 + }, + { + "epoch": 1.5006986681007886, + "grad_norm": 11.335489273071289, + "learning_rate": 2.4988355531653527e-05, + "loss": 2.7853, + "step": 4827500 + }, + { + "epoch": 1.5008541003812754, + "grad_norm": 8.348020553588867, + "learning_rate": 2.498576499364541e-05, + "loss": 2.7789, + "step": 4828000 + }, + { + "epoch": 1.5010095326617623, + "grad_norm": 12.617910385131836, + "learning_rate": 2.49831744556373e-05, + "loss": 2.7928, + "step": 4828500 + }, + { + "epoch": 1.5011649649422492, + "grad_norm": 11.162732124328613, + "learning_rate": 2.4980583917629182e-05, + "loss": 2.7521, + "step": 4829000 + }, + { + "epoch": 1.501320397222736, + "grad_norm": 10.095220565795898, + "learning_rate": 2.4977993379621066e-05, + "loss": 2.7785, + "step": 4829500 + }, + { + "epoch": 1.501475829503223, + "grad_norm": 9.685508728027344, + "learning_rate": 2.4975402841612953e-05, + "loss": 2.7807, + "step": 4830000 + }, + { + "epoch": 1.5016312617837098, + "grad_norm": 8.38416862487793, + "learning_rate": 2.497281230360484e-05, + "loss": 2.7899, + "step": 4830500 + }, + { + "epoch": 1.5017866940641966, + "grad_norm": 24.044912338256836, + "learning_rate": 2.4970221765596724e-05, + "loss": 2.7344, + "step": 4831000 + }, + { + "epoch": 1.5019421263446835, + "grad_norm": 29.617000579833984, + "learning_rate": 2.4967631227588608e-05, + "loss": 2.7754, + "step": 4831500 + }, + { + "epoch": 1.5020975586251704, + "grad_norm": 59.36172866821289, + "learning_rate": 2.4965040689580495e-05, + "loss": 2.7739, + "step": 4832000 + }, + { + "epoch": 1.5022529909056572, + "grad_norm": 9.563129425048828, + "learning_rate": 2.4962450151572382e-05, + "loss": 2.7225, + "step": 4832500 + }, + { + "epoch": 1.502408423186144, + "grad_norm": 13.494993209838867, + "learning_rate": 2.4959859613564266e-05, + "loss": 2.7891, + "step": 4833000 + }, + { + "epoch": 1.5025638554666312, + "grad_norm": 8.80113697052002, + "learning_rate": 2.495726907555615e-05, + "loss": 2.7998, + "step": 4833500 + }, + { + "epoch": 1.502719287747118, + "grad_norm": 10.37382698059082, + "learning_rate": 2.4954678537548037e-05, + "loss": 2.7806, + "step": 4834000 + }, + { + "epoch": 1.502874720027605, + "grad_norm": 9.336868286132812, + "learning_rate": 2.495208799953992e-05, + "loss": 2.7764, + "step": 4834500 + }, + { + "epoch": 1.5030301523080918, + "grad_norm": 8.187996864318848, + "learning_rate": 2.4949497461531808e-05, + "loss": 2.7751, + "step": 4835000 + }, + { + "epoch": 1.5031855845885786, + "grad_norm": 9.730140686035156, + "learning_rate": 2.494690692352369e-05, + "loss": 2.7839, + "step": 4835500 + }, + { + "epoch": 1.5033410168690655, + "grad_norm": 11.64090347290039, + "learning_rate": 2.494431638551558e-05, + "loss": 2.8039, + "step": 4836000 + }, + { + "epoch": 1.5034964491495524, + "grad_norm": 7.218530654907227, + "learning_rate": 2.4941725847507462e-05, + "loss": 2.7652, + "step": 4836500 + }, + { + "epoch": 1.5036518814300392, + "grad_norm": 8.357902526855469, + "learning_rate": 2.493913530949935e-05, + "loss": 2.7405, + "step": 4837000 + }, + { + "epoch": 1.5038073137105261, + "grad_norm": 11.036425590515137, + "learning_rate": 2.4936544771491237e-05, + "loss": 2.7686, + "step": 4837500 + }, + { + "epoch": 1.503962745991013, + "grad_norm": 11.621740341186523, + "learning_rate": 2.493395423348312e-05, + "loss": 2.7726, + "step": 4838000 + }, + { + "epoch": 1.5041181782714999, + "grad_norm": 8.350508689880371, + "learning_rate": 2.4931363695475004e-05, + "loss": 2.7849, + "step": 4838500 + }, + { + "epoch": 1.5042736105519867, + "grad_norm": 11.709531784057617, + "learning_rate": 2.4928773157466888e-05, + "loss": 2.751, + "step": 4839000 + }, + { + "epoch": 1.5044290428324736, + "grad_norm": 9.228922843933105, + "learning_rate": 2.4926182619458775e-05, + "loss": 2.7659, + "step": 4839500 + }, + { + "epoch": 1.5045844751129605, + "grad_norm": 9.650333404541016, + "learning_rate": 2.4923592081450662e-05, + "loss": 2.7983, + "step": 4840000 + }, + { + "epoch": 1.5047399073934473, + "grad_norm": 8.206274032592773, + "learning_rate": 2.4921001543442546e-05, + "loss": 2.7714, + "step": 4840500 + }, + { + "epoch": 1.5048953396739342, + "grad_norm": 8.544745445251465, + "learning_rate": 2.491841100543443e-05, + "loss": 2.811, + "step": 4841000 + }, + { + "epoch": 1.505050771954421, + "grad_norm": 10.707390785217285, + "learning_rate": 2.4915820467426317e-05, + "loss": 2.7291, + "step": 4841500 + }, + { + "epoch": 1.505206204234908, + "grad_norm": 9.380297660827637, + "learning_rate": 2.4913229929418204e-05, + "loss": 2.7202, + "step": 4842000 + }, + { + "epoch": 1.5053616365153948, + "grad_norm": 10.386192321777344, + "learning_rate": 2.4910639391410088e-05, + "loss": 2.7481, + "step": 4842500 + }, + { + "epoch": 1.5055170687958817, + "grad_norm": 9.044313430786133, + "learning_rate": 2.4908048853401975e-05, + "loss": 2.7608, + "step": 4843000 + }, + { + "epoch": 1.5056725010763685, + "grad_norm": 8.91639232635498, + "learning_rate": 2.490545831539386e-05, + "loss": 2.7703, + "step": 4843500 + }, + { + "epoch": 1.5058279333568554, + "grad_norm": 11.770174026489258, + "learning_rate": 2.4902867777385743e-05, + "loss": 2.7754, + "step": 4844000 + }, + { + "epoch": 1.5059833656373423, + "grad_norm": 8.436638832092285, + "learning_rate": 2.490027723937763e-05, + "loss": 2.7473, + "step": 4844500 + }, + { + "epoch": 1.5061387979178291, + "grad_norm": 8.730281829833984, + "learning_rate": 2.4897686701369517e-05, + "loss": 2.7997, + "step": 4845000 + }, + { + "epoch": 1.506294230198316, + "grad_norm": 9.888775825500488, + "learning_rate": 2.48950961633614e-05, + "loss": 2.8151, + "step": 4845500 + }, + { + "epoch": 1.5064496624788029, + "grad_norm": 7.591353893280029, + "learning_rate": 2.4892505625353284e-05, + "loss": 2.7783, + "step": 4846000 + }, + { + "epoch": 1.5066050947592897, + "grad_norm": 15.725878715515137, + "learning_rate": 2.488991508734517e-05, + "loss": 2.7634, + "step": 4846500 + }, + { + "epoch": 1.5067605270397766, + "grad_norm": 8.454747200012207, + "learning_rate": 2.488732454933706e-05, + "loss": 2.7603, + "step": 4847000 + }, + { + "epoch": 1.5069159593202635, + "grad_norm": 8.754606246948242, + "learning_rate": 2.4884734011328942e-05, + "loss": 2.764, + "step": 4847500 + }, + { + "epoch": 1.5070713916007503, + "grad_norm": 10.070343971252441, + "learning_rate": 2.4882143473320826e-05, + "loss": 2.7548, + "step": 4848000 + }, + { + "epoch": 1.5072268238812372, + "grad_norm": 26.520692825317383, + "learning_rate": 2.4879552935312713e-05, + "loss": 2.8082, + "step": 4848500 + }, + { + "epoch": 1.507382256161724, + "grad_norm": 8.414320945739746, + "learning_rate": 2.4876962397304597e-05, + "loss": 2.798, + "step": 4849000 + }, + { + "epoch": 1.507537688442211, + "grad_norm": 12.903173446655273, + "learning_rate": 2.4874371859296484e-05, + "loss": 2.7788, + "step": 4849500 + }, + { + "epoch": 1.5076931207226978, + "grad_norm": 8.194938659667969, + "learning_rate": 2.4871781321288368e-05, + "loss": 2.7089, + "step": 4850000 + }, + { + "epoch": 1.5078485530031847, + "grad_norm": 10.813254356384277, + "learning_rate": 2.4869190783280255e-05, + "loss": 2.755, + "step": 4850500 + }, + { + "epoch": 1.5080039852836715, + "grad_norm": 11.265710830688477, + "learning_rate": 2.486660024527214e-05, + "loss": 2.7339, + "step": 4851000 + }, + { + "epoch": 1.5081594175641586, + "grad_norm": 8.378079414367676, + "learning_rate": 2.4864009707264023e-05, + "loss": 2.7661, + "step": 4851500 + }, + { + "epoch": 1.5083148498446455, + "grad_norm": 8.66795539855957, + "learning_rate": 2.4861419169255913e-05, + "loss": 2.7705, + "step": 4852000 + }, + { + "epoch": 1.5084702821251323, + "grad_norm": 8.910170555114746, + "learning_rate": 2.4858828631247797e-05, + "loss": 2.8037, + "step": 4852500 + }, + { + "epoch": 1.5086257144056192, + "grad_norm": 8.745684623718262, + "learning_rate": 2.485623809323968e-05, + "loss": 2.7987, + "step": 4853000 + }, + { + "epoch": 1.508781146686106, + "grad_norm": 9.173192977905273, + "learning_rate": 2.4853647555231565e-05, + "loss": 2.7777, + "step": 4853500 + }, + { + "epoch": 1.508936578966593, + "grad_norm": 8.26269245147705, + "learning_rate": 2.4851057017223452e-05, + "loss": 2.8016, + "step": 4854000 + }, + { + "epoch": 1.5090920112470798, + "grad_norm": 8.620999336242676, + "learning_rate": 2.484846647921534e-05, + "loss": 2.7986, + "step": 4854500 + }, + { + "epoch": 1.5092474435275667, + "grad_norm": 9.299824714660645, + "learning_rate": 2.4845875941207223e-05, + "loss": 2.7932, + "step": 4855000 + }, + { + "epoch": 1.5094028758080535, + "grad_norm": 9.56137466430664, + "learning_rate": 2.484328540319911e-05, + "loss": 2.8163, + "step": 4855500 + }, + { + "epoch": 1.5095583080885404, + "grad_norm": 7.349072456359863, + "learning_rate": 2.4840694865190994e-05, + "loss": 2.7475, + "step": 4856000 + }, + { + "epoch": 1.5097137403690273, + "grad_norm": 8.411843299865723, + "learning_rate": 2.4838104327182877e-05, + "loss": 2.7689, + "step": 4856500 + }, + { + "epoch": 1.5098691726495141, + "grad_norm": 11.444055557250977, + "learning_rate": 2.4835513789174764e-05, + "loss": 2.7727, + "step": 4857000 + }, + { + "epoch": 1.5100246049300012, + "grad_norm": 12.687554359436035, + "learning_rate": 2.483292325116665e-05, + "loss": 2.7776, + "step": 4857500 + }, + { + "epoch": 1.510180037210488, + "grad_norm": 7.3442301750183105, + "learning_rate": 2.4830332713158535e-05, + "loss": 2.8074, + "step": 4858000 + }, + { + "epoch": 1.510335469490975, + "grad_norm": 14.309046745300293, + "learning_rate": 2.482774217515042e-05, + "loss": 2.8105, + "step": 4858500 + }, + { + "epoch": 1.5104909017714618, + "grad_norm": 8.794824600219727, + "learning_rate": 2.4825151637142306e-05, + "loss": 2.7335, + "step": 4859000 + }, + { + "epoch": 1.5106463340519487, + "grad_norm": 10.098311424255371, + "learning_rate": 2.4822561099134193e-05, + "loss": 2.7647, + "step": 4859500 + }, + { + "epoch": 1.5108017663324356, + "grad_norm": 8.535257339477539, + "learning_rate": 2.4819970561126077e-05, + "loss": 2.7345, + "step": 4860000 + }, + { + "epoch": 1.5109571986129224, + "grad_norm": 7.586628437042236, + "learning_rate": 2.481738002311796e-05, + "loss": 2.7473, + "step": 4860500 + }, + { + "epoch": 1.5111126308934093, + "grad_norm": 11.424813270568848, + "learning_rate": 2.4814789485109848e-05, + "loss": 2.7551, + "step": 4861000 + }, + { + "epoch": 1.5112680631738962, + "grad_norm": 10.892036437988281, + "learning_rate": 2.4812198947101732e-05, + "loss": 2.7912, + "step": 4861500 + }, + { + "epoch": 1.511423495454383, + "grad_norm": 9.908403396606445, + "learning_rate": 2.480960840909362e-05, + "loss": 2.7334, + "step": 4862000 + }, + { + "epoch": 1.51157892773487, + "grad_norm": 8.532099723815918, + "learning_rate": 2.4807017871085503e-05, + "loss": 2.7909, + "step": 4862500 + }, + { + "epoch": 1.5117343600153568, + "grad_norm": 27.02155113220215, + "learning_rate": 2.480442733307739e-05, + "loss": 2.7809, + "step": 4863000 + }, + { + "epoch": 1.5118897922958436, + "grad_norm": 9.411417007446289, + "learning_rate": 2.4801836795069274e-05, + "loss": 2.8052, + "step": 4863500 + }, + { + "epoch": 1.5120452245763305, + "grad_norm": 8.501365661621094, + "learning_rate": 2.479924625706116e-05, + "loss": 2.7691, + "step": 4864000 + }, + { + "epoch": 1.5122006568568174, + "grad_norm": 30.46397590637207, + "learning_rate": 2.4796655719053048e-05, + "loss": 2.7607, + "step": 4864500 + }, + { + "epoch": 1.5123560891373042, + "grad_norm": 8.792203903198242, + "learning_rate": 2.4794065181044932e-05, + "loss": 2.7559, + "step": 4865000 + }, + { + "epoch": 1.512511521417791, + "grad_norm": 8.64815902709961, + "learning_rate": 2.4791474643036816e-05, + "loss": 2.7901, + "step": 4865500 + }, + { + "epoch": 1.512666953698278, + "grad_norm": 25.42246437072754, + "learning_rate": 2.47888841050287e-05, + "loss": 2.7783, + "step": 4866000 + }, + { + "epoch": 1.5128223859787648, + "grad_norm": 8.692898750305176, + "learning_rate": 2.4786293567020587e-05, + "loss": 2.7765, + "step": 4866500 + }, + { + "epoch": 1.5129778182592517, + "grad_norm": 10.038673400878906, + "learning_rate": 2.4783703029012474e-05, + "loss": 2.7352, + "step": 4867000 + }, + { + "epoch": 1.5131332505397386, + "grad_norm": 10.688804626464844, + "learning_rate": 2.4781112491004357e-05, + "loss": 2.7695, + "step": 4867500 + }, + { + "epoch": 1.5132886828202254, + "grad_norm": 9.317644119262695, + "learning_rate": 2.477852195299624e-05, + "loss": 2.771, + "step": 4868000 + }, + { + "epoch": 1.5134441151007123, + "grad_norm": 8.426224708557129, + "learning_rate": 2.477593141498813e-05, + "loss": 2.7829, + "step": 4868500 + }, + { + "epoch": 1.5135995473811992, + "grad_norm": 10.462910652160645, + "learning_rate": 2.4773340876980016e-05, + "loss": 2.8226, + "step": 4869000 + }, + { + "epoch": 1.513754979661686, + "grad_norm": 9.512361526489258, + "learning_rate": 2.47707503389719e-05, + "loss": 2.7674, + "step": 4869500 + }, + { + "epoch": 1.513910411942173, + "grad_norm": 14.994911193847656, + "learning_rate": 2.4768159800963786e-05, + "loss": 2.7328, + "step": 4870000 + }, + { + "epoch": 1.5140658442226598, + "grad_norm": 8.17922592163086, + "learning_rate": 2.476556926295567e-05, + "loss": 2.7722, + "step": 4870500 + }, + { + "epoch": 1.5142212765031466, + "grad_norm": 10.09383487701416, + "learning_rate": 2.4762978724947554e-05, + "loss": 2.8007, + "step": 4871000 + }, + { + "epoch": 1.5143767087836335, + "grad_norm": 9.91702651977539, + "learning_rate": 2.476038818693944e-05, + "loss": 2.7618, + "step": 4871500 + }, + { + "epoch": 1.5145321410641204, + "grad_norm": 7.999060153961182, + "learning_rate": 2.4757797648931328e-05, + "loss": 2.7509, + "step": 4872000 + }, + { + "epoch": 1.5146875733446072, + "grad_norm": 7.7568559646606445, + "learning_rate": 2.4755207110923212e-05, + "loss": 2.7379, + "step": 4872500 + }, + { + "epoch": 1.514843005625094, + "grad_norm": 62.694847106933594, + "learning_rate": 2.4752616572915096e-05, + "loss": 2.7693, + "step": 4873000 + }, + { + "epoch": 1.514998437905581, + "grad_norm": 8.734769821166992, + "learning_rate": 2.4750026034906983e-05, + "loss": 2.8088, + "step": 4873500 + }, + { + "epoch": 1.5151538701860678, + "grad_norm": 9.469022750854492, + "learning_rate": 2.474743549689887e-05, + "loss": 2.7601, + "step": 4874000 + }, + { + "epoch": 1.5153093024665547, + "grad_norm": 8.007719039916992, + "learning_rate": 2.4744844958890754e-05, + "loss": 2.7621, + "step": 4874500 + }, + { + "epoch": 1.5154647347470416, + "grad_norm": 26.770294189453125, + "learning_rate": 2.4742254420882638e-05, + "loss": 2.8125, + "step": 4875000 + }, + { + "epoch": 1.5156201670275287, + "grad_norm": 8.937703132629395, + "learning_rate": 2.4739663882874525e-05, + "loss": 2.7732, + "step": 4875500 + }, + { + "epoch": 1.5157755993080155, + "grad_norm": 7.938660621643066, + "learning_rate": 2.473707334486641e-05, + "loss": 2.7537, + "step": 4876000 + }, + { + "epoch": 1.5159310315885024, + "grad_norm": 8.206975936889648, + "learning_rate": 2.4734482806858296e-05, + "loss": 2.7733, + "step": 4876500 + }, + { + "epoch": 1.5160864638689893, + "grad_norm": 26.981277465820312, + "learning_rate": 2.473189226885018e-05, + "loss": 2.7915, + "step": 4877000 + }, + { + "epoch": 1.5162418961494761, + "grad_norm": 9.916196823120117, + "learning_rate": 2.4729301730842067e-05, + "loss": 2.7587, + "step": 4877500 + }, + { + "epoch": 1.516397328429963, + "grad_norm": 9.855080604553223, + "learning_rate": 2.472671119283395e-05, + "loss": 2.7758, + "step": 4878000 + }, + { + "epoch": 1.5165527607104499, + "grad_norm": 10.386467933654785, + "learning_rate": 2.4724120654825834e-05, + "loss": 2.7558, + "step": 4878500 + }, + { + "epoch": 1.5167081929909367, + "grad_norm": 8.144760131835938, + "learning_rate": 2.4721530116817725e-05, + "loss": 2.7678, + "step": 4879000 + }, + { + "epoch": 1.5168636252714236, + "grad_norm": 11.267659187316895, + "learning_rate": 2.471893957880961e-05, + "loss": 2.7889, + "step": 4879500 + }, + { + "epoch": 1.5170190575519105, + "grad_norm": 8.911103248596191, + "learning_rate": 2.4716349040801492e-05, + "loss": 2.8215, + "step": 4880000 + }, + { + "epoch": 1.5171744898323973, + "grad_norm": 9.281022071838379, + "learning_rate": 2.4713758502793376e-05, + "loss": 2.745, + "step": 4880500 + }, + { + "epoch": 1.5173299221128842, + "grad_norm": 9.97399616241455, + "learning_rate": 2.4711167964785263e-05, + "loss": 2.7218, + "step": 4881000 + }, + { + "epoch": 1.5174853543933713, + "grad_norm": 7.100555419921875, + "learning_rate": 2.470857742677715e-05, + "loss": 2.7828, + "step": 4881500 + }, + { + "epoch": 1.5176407866738582, + "grad_norm": 9.6196928024292, + "learning_rate": 2.4705986888769034e-05, + "loss": 2.777, + "step": 4882000 + }, + { + "epoch": 1.517796218954345, + "grad_norm": 7.4311418533325195, + "learning_rate": 2.470339635076092e-05, + "loss": 2.7623, + "step": 4882500 + }, + { + "epoch": 1.517951651234832, + "grad_norm": 9.687868118286133, + "learning_rate": 2.4700805812752805e-05, + "loss": 2.8273, + "step": 4883000 + }, + { + "epoch": 1.5181070835153188, + "grad_norm": 27.69037628173828, + "learning_rate": 2.469821527474469e-05, + "loss": 2.7782, + "step": 4883500 + }, + { + "epoch": 1.5182625157958056, + "grad_norm": 9.215699195861816, + "learning_rate": 2.4695624736736576e-05, + "loss": 2.8047, + "step": 4884000 + }, + { + "epoch": 1.5184179480762925, + "grad_norm": 8.161736488342285, + "learning_rate": 2.4693034198728463e-05, + "loss": 2.7327, + "step": 4884500 + }, + { + "epoch": 1.5185733803567794, + "grad_norm": 9.403947830200195, + "learning_rate": 2.4690443660720347e-05, + "loss": 2.7624, + "step": 4885000 + }, + { + "epoch": 1.5187288126372662, + "grad_norm": 9.415914535522461, + "learning_rate": 2.468785312271223e-05, + "loss": 2.7447, + "step": 4885500 + }, + { + "epoch": 1.518884244917753, + "grad_norm": 10.092719078063965, + "learning_rate": 2.4685262584704118e-05, + "loss": 2.7663, + "step": 4886000 + }, + { + "epoch": 1.51903967719824, + "grad_norm": 9.398636817932129, + "learning_rate": 2.4682672046696005e-05, + "loss": 2.7877, + "step": 4886500 + }, + { + "epoch": 1.5191951094787268, + "grad_norm": 10.381779670715332, + "learning_rate": 2.468008150868789e-05, + "loss": 2.8042, + "step": 4887000 + }, + { + "epoch": 1.5193505417592137, + "grad_norm": 22.670642852783203, + "learning_rate": 2.4677490970679772e-05, + "loss": 2.7818, + "step": 4887500 + }, + { + "epoch": 1.5195059740397006, + "grad_norm": 9.923130989074707, + "learning_rate": 2.467490043267166e-05, + "loss": 2.7721, + "step": 4888000 + }, + { + "epoch": 1.5196614063201874, + "grad_norm": 9.325897216796875, + "learning_rate": 2.4672309894663543e-05, + "loss": 2.8164, + "step": 4888500 + }, + { + "epoch": 1.5198168386006743, + "grad_norm": 8.822379112243652, + "learning_rate": 2.466971935665543e-05, + "loss": 2.8196, + "step": 4889000 + }, + { + "epoch": 1.5199722708811612, + "grad_norm": 11.794462203979492, + "learning_rate": 2.4667128818647314e-05, + "loss": 2.7229, + "step": 4889500 + }, + { + "epoch": 1.520127703161648, + "grad_norm": 9.394246101379395, + "learning_rate": 2.46645382806392e-05, + "loss": 2.787, + "step": 4890000 + }, + { + "epoch": 1.520283135442135, + "grad_norm": 9.913773536682129, + "learning_rate": 2.4661947742631085e-05, + "loss": 2.8133, + "step": 4890500 + }, + { + "epoch": 1.5204385677226218, + "grad_norm": 13.206926345825195, + "learning_rate": 2.4659357204622972e-05, + "loss": 2.7772, + "step": 4891000 + }, + { + "epoch": 1.5205940000031086, + "grad_norm": 8.949934959411621, + "learning_rate": 2.465676666661486e-05, + "loss": 2.7779, + "step": 4891500 + }, + { + "epoch": 1.5207494322835955, + "grad_norm": 8.869717597961426, + "learning_rate": 2.4654176128606743e-05, + "loss": 2.8049, + "step": 4892000 + }, + { + "epoch": 1.5209048645640824, + "grad_norm": 7.111301422119141, + "learning_rate": 2.4651585590598627e-05, + "loss": 2.7427, + "step": 4892500 + }, + { + "epoch": 1.5210602968445692, + "grad_norm": 8.7931489944458, + "learning_rate": 2.464899505259051e-05, + "loss": 2.7432, + "step": 4893000 + }, + { + "epoch": 1.521215729125056, + "grad_norm": 19.5391845703125, + "learning_rate": 2.4646404514582398e-05, + "loss": 2.7568, + "step": 4893500 + }, + { + "epoch": 1.521371161405543, + "grad_norm": 9.097938537597656, + "learning_rate": 2.4643813976574285e-05, + "loss": 2.7724, + "step": 4894000 + }, + { + "epoch": 1.5215265936860298, + "grad_norm": 8.39836597442627, + "learning_rate": 2.464122343856617e-05, + "loss": 2.7976, + "step": 4894500 + }, + { + "epoch": 1.5216820259665167, + "grad_norm": 11.82336711883545, + "learning_rate": 2.4638632900558053e-05, + "loss": 2.7583, + "step": 4895000 + }, + { + "epoch": 1.5218374582470036, + "grad_norm": 14.274950981140137, + "learning_rate": 2.463604236254994e-05, + "loss": 2.7231, + "step": 4895500 + }, + { + "epoch": 1.5219928905274904, + "grad_norm": 8.122672080993652, + "learning_rate": 2.4633451824541827e-05, + "loss": 2.7609, + "step": 4896000 + }, + { + "epoch": 1.5221483228079773, + "grad_norm": 8.728693008422852, + "learning_rate": 2.463086128653371e-05, + "loss": 2.7646, + "step": 4896500 + }, + { + "epoch": 1.5223037550884642, + "grad_norm": 10.097675323486328, + "learning_rate": 2.4628270748525598e-05, + "loss": 2.7503, + "step": 4897000 + }, + { + "epoch": 1.522459187368951, + "grad_norm": 37.269500732421875, + "learning_rate": 2.462568021051748e-05, + "loss": 2.7879, + "step": 4897500 + }, + { + "epoch": 1.522614619649438, + "grad_norm": 12.181615829467773, + "learning_rate": 2.4623089672509365e-05, + "loss": 2.7709, + "step": 4898000 + }, + { + "epoch": 1.5227700519299248, + "grad_norm": 9.082796096801758, + "learning_rate": 2.4620499134501253e-05, + "loss": 2.7715, + "step": 4898500 + }, + { + "epoch": 1.5229254842104116, + "grad_norm": 10.597855567932129, + "learning_rate": 2.461790859649314e-05, + "loss": 2.7611, + "step": 4899000 + }, + { + "epoch": 1.5230809164908987, + "grad_norm": 7.974411487579346, + "learning_rate": 2.4615318058485023e-05, + "loss": 2.7995, + "step": 4899500 + }, + { + "epoch": 1.5232363487713856, + "grad_norm": 14.50156021118164, + "learning_rate": 2.4612727520476907e-05, + "loss": 2.7449, + "step": 4900000 + }, + { + "epoch": 1.5233917810518725, + "grad_norm": 26.13125228881836, + "learning_rate": 2.4610136982468794e-05, + "loss": 2.7713, + "step": 4900500 + }, + { + "epoch": 1.5235472133323593, + "grad_norm": 11.632826805114746, + "learning_rate": 2.460754644446068e-05, + "loss": 2.7275, + "step": 4901000 + }, + { + "epoch": 1.5237026456128462, + "grad_norm": 10.793046951293945, + "learning_rate": 2.4604955906452565e-05, + "loss": 2.7787, + "step": 4901500 + }, + { + "epoch": 1.523858077893333, + "grad_norm": 7.909637451171875, + "learning_rate": 2.460236536844445e-05, + "loss": 2.795, + "step": 4902000 + }, + { + "epoch": 1.52401351017382, + "grad_norm": 9.051383018493652, + "learning_rate": 2.4599774830436336e-05, + "loss": 2.7785, + "step": 4902500 + }, + { + "epoch": 1.5241689424543068, + "grad_norm": 8.732221603393555, + "learning_rate": 2.459718429242822e-05, + "loss": 2.7839, + "step": 4903000 + }, + { + "epoch": 1.5243243747347937, + "grad_norm": 9.589634895324707, + "learning_rate": 2.4594593754420107e-05, + "loss": 2.7869, + "step": 4903500 + }, + { + "epoch": 1.5244798070152805, + "grad_norm": 9.491697311401367, + "learning_rate": 2.459200321641199e-05, + "loss": 2.7326, + "step": 4904000 + }, + { + "epoch": 1.5246352392957674, + "grad_norm": 22.650590896606445, + "learning_rate": 2.4589412678403878e-05, + "loss": 2.7465, + "step": 4904500 + }, + { + "epoch": 1.5247906715762543, + "grad_norm": 10.384672164916992, + "learning_rate": 2.4586822140395762e-05, + "loss": 2.7874, + "step": 4905000 + }, + { + "epoch": 1.5249461038567413, + "grad_norm": 10.133508682250977, + "learning_rate": 2.458423160238765e-05, + "loss": 2.7846, + "step": 4905500 + }, + { + "epoch": 1.5251015361372282, + "grad_norm": 8.605276107788086, + "learning_rate": 2.4581641064379536e-05, + "loss": 2.7888, + "step": 4906000 + }, + { + "epoch": 1.525256968417715, + "grad_norm": 9.178457260131836, + "learning_rate": 2.457905052637142e-05, + "loss": 2.7643, + "step": 4906500 + }, + { + "epoch": 1.525412400698202, + "grad_norm": 13.631753921508789, + "learning_rate": 2.4576459988363304e-05, + "loss": 2.8125, + "step": 4907000 + }, + { + "epoch": 1.5255678329786888, + "grad_norm": 8.833234786987305, + "learning_rate": 2.4573869450355187e-05, + "loss": 2.7833, + "step": 4907500 + }, + { + "epoch": 1.5257232652591757, + "grad_norm": 14.317710876464844, + "learning_rate": 2.4571278912347075e-05, + "loss": 2.7765, + "step": 4908000 + }, + { + "epoch": 1.5258786975396625, + "grad_norm": 14.380549430847168, + "learning_rate": 2.4568688374338962e-05, + "loss": 2.7654, + "step": 4908500 + }, + { + "epoch": 1.5260341298201494, + "grad_norm": 9.110855102539062, + "learning_rate": 2.4566097836330845e-05, + "loss": 2.7945, + "step": 4909000 + }, + { + "epoch": 1.5261895621006363, + "grad_norm": 9.988574028015137, + "learning_rate": 2.4563507298322733e-05, + "loss": 2.7598, + "step": 4909500 + }, + { + "epoch": 1.5263449943811231, + "grad_norm": 10.109825134277344, + "learning_rate": 2.4560916760314616e-05, + "loss": 2.7888, + "step": 4910000 + }, + { + "epoch": 1.52650042666161, + "grad_norm": 44.68026351928711, + "learning_rate": 2.4558326222306504e-05, + "loss": 2.8322, + "step": 4910500 + }, + { + "epoch": 1.5266558589420969, + "grad_norm": 20.808856964111328, + "learning_rate": 2.4555735684298387e-05, + "loss": 2.7857, + "step": 4911000 + }, + { + "epoch": 1.5268112912225837, + "grad_norm": 10.726390838623047, + "learning_rate": 2.4553145146290274e-05, + "loss": 2.7702, + "step": 4911500 + }, + { + "epoch": 1.5269667235030706, + "grad_norm": 13.71649169921875, + "learning_rate": 2.4550554608282158e-05, + "loss": 2.7839, + "step": 4912000 + }, + { + "epoch": 1.5271221557835575, + "grad_norm": 5.925815582275391, + "learning_rate": 2.4547964070274042e-05, + "loss": 2.8045, + "step": 4912500 + }, + { + "epoch": 1.5272775880640443, + "grad_norm": 9.181633949279785, + "learning_rate": 2.454537353226593e-05, + "loss": 2.7184, + "step": 4913000 + }, + { + "epoch": 1.5274330203445312, + "grad_norm": 10.591597557067871, + "learning_rate": 2.4542782994257816e-05, + "loss": 2.7607, + "step": 4913500 + }, + { + "epoch": 1.527588452625018, + "grad_norm": 8.184235572814941, + "learning_rate": 2.45401924562497e-05, + "loss": 2.7684, + "step": 4914000 + }, + { + "epoch": 1.527743884905505, + "grad_norm": 10.562334060668945, + "learning_rate": 2.4537601918241584e-05, + "loss": 2.7419, + "step": 4914500 + }, + { + "epoch": 1.5278993171859918, + "grad_norm": 9.119128227233887, + "learning_rate": 2.453501138023347e-05, + "loss": 2.7641, + "step": 4915000 + }, + { + "epoch": 1.5280547494664787, + "grad_norm": 9.428504943847656, + "learning_rate": 2.4532420842225358e-05, + "loss": 2.7718, + "step": 4915500 + }, + { + "epoch": 1.5282101817469655, + "grad_norm": 10.185968399047852, + "learning_rate": 2.4529830304217242e-05, + "loss": 2.8036, + "step": 4916000 + }, + { + "epoch": 1.5283656140274524, + "grad_norm": 9.545917510986328, + "learning_rate": 2.4527239766209126e-05, + "loss": 2.7735, + "step": 4916500 + }, + { + "epoch": 1.5285210463079393, + "grad_norm": 12.227290153503418, + "learning_rate": 2.4524649228201013e-05, + "loss": 2.7355, + "step": 4917000 + }, + { + "epoch": 1.5286764785884261, + "grad_norm": 9.816524505615234, + "learning_rate": 2.4522058690192897e-05, + "loss": 2.7485, + "step": 4917500 + }, + { + "epoch": 1.528831910868913, + "grad_norm": 9.930705070495605, + "learning_rate": 2.4519468152184784e-05, + "loss": 2.7392, + "step": 4918000 + }, + { + "epoch": 1.5289873431493999, + "grad_norm": 7.5300397872924805, + "learning_rate": 2.451687761417667e-05, + "loss": 2.7898, + "step": 4918500 + }, + { + "epoch": 1.5291427754298867, + "grad_norm": 10.653768539428711, + "learning_rate": 2.4514287076168555e-05, + "loss": 2.7854, + "step": 4919000 + }, + { + "epoch": 1.5292982077103736, + "grad_norm": 11.35655403137207, + "learning_rate": 2.451169653816044e-05, + "loss": 2.8005, + "step": 4919500 + }, + { + "epoch": 1.5294536399908605, + "grad_norm": 9.103102684020996, + "learning_rate": 2.4509106000152322e-05, + "loss": 2.8282, + "step": 4920000 + }, + { + "epoch": 1.5296090722713473, + "grad_norm": 8.550786972045898, + "learning_rate": 2.4506515462144213e-05, + "loss": 2.7895, + "step": 4920500 + }, + { + "epoch": 1.5297645045518342, + "grad_norm": 8.155983924865723, + "learning_rate": 2.4503924924136097e-05, + "loss": 2.7742, + "step": 4921000 + }, + { + "epoch": 1.529919936832321, + "grad_norm": 14.579524040222168, + "learning_rate": 2.450133438612798e-05, + "loss": 2.7738, + "step": 4921500 + }, + { + "epoch": 1.530075369112808, + "grad_norm": 8.38681697845459, + "learning_rate": 2.4498743848119867e-05, + "loss": 2.7551, + "step": 4922000 + }, + { + "epoch": 1.5302308013932948, + "grad_norm": 8.949326515197754, + "learning_rate": 2.449615331011175e-05, + "loss": 2.8001, + "step": 4922500 + }, + { + "epoch": 1.5303862336737817, + "grad_norm": 9.590889930725098, + "learning_rate": 2.449356277210364e-05, + "loss": 2.7815, + "step": 4923000 + }, + { + "epoch": 1.5305416659542685, + "grad_norm": 10.452917098999023, + "learning_rate": 2.4490972234095522e-05, + "loss": 2.7962, + "step": 4923500 + }, + { + "epoch": 1.5306970982347556, + "grad_norm": 8.7258939743042, + "learning_rate": 2.448838169608741e-05, + "loss": 2.8157, + "step": 4924000 + }, + { + "epoch": 1.5308525305152425, + "grad_norm": 7.72509765625, + "learning_rate": 2.4485791158079293e-05, + "loss": 2.8028, + "step": 4924500 + }, + { + "epoch": 1.5310079627957294, + "grad_norm": 8.491610527038574, + "learning_rate": 2.4483200620071177e-05, + "loss": 2.7395, + "step": 4925000 + }, + { + "epoch": 1.5311633950762162, + "grad_norm": 9.061629295349121, + "learning_rate": 2.4480610082063064e-05, + "loss": 2.7781, + "step": 4925500 + }, + { + "epoch": 1.531318827356703, + "grad_norm": 9.472660064697266, + "learning_rate": 2.447801954405495e-05, + "loss": 2.8437, + "step": 4926000 + }, + { + "epoch": 1.53147425963719, + "grad_norm": 7.4244303703308105, + "learning_rate": 2.4475429006046835e-05, + "loss": 2.7505, + "step": 4926500 + }, + { + "epoch": 1.5316296919176768, + "grad_norm": 11.382597923278809, + "learning_rate": 2.447283846803872e-05, + "loss": 2.7851, + "step": 4927000 + }, + { + "epoch": 1.5317851241981637, + "grad_norm": 9.32883358001709, + "learning_rate": 2.4470247930030606e-05, + "loss": 2.8185, + "step": 4927500 + }, + { + "epoch": 1.5319405564786506, + "grad_norm": 7.586179733276367, + "learning_rate": 2.4467657392022493e-05, + "loss": 2.7982, + "step": 4928000 + }, + { + "epoch": 1.5320959887591374, + "grad_norm": 88.84709930419922, + "learning_rate": 2.4465066854014377e-05, + "loss": 2.7564, + "step": 4928500 + }, + { + "epoch": 1.5322514210396243, + "grad_norm": 12.539758682250977, + "learning_rate": 2.446247631600626e-05, + "loss": 2.7884, + "step": 4929000 + }, + { + "epoch": 1.5324068533201112, + "grad_norm": 10.915931701660156, + "learning_rate": 2.4459885777998148e-05, + "loss": 2.7436, + "step": 4929500 + }, + { + "epoch": 1.5325622856005983, + "grad_norm": 9.697439193725586, + "learning_rate": 2.445729523999003e-05, + "loss": 2.7556, + "step": 4930000 + }, + { + "epoch": 1.5327177178810851, + "grad_norm": 18.974382400512695, + "learning_rate": 2.445470470198192e-05, + "loss": 2.8009, + "step": 4930500 + }, + { + "epoch": 1.532873150161572, + "grad_norm": 9.563976287841797, + "learning_rate": 2.4452114163973806e-05, + "loss": 2.776, + "step": 4931000 + }, + { + "epoch": 1.5330285824420589, + "grad_norm": 8.53972053527832, + "learning_rate": 2.444952362596569e-05, + "loss": 2.7501, + "step": 4931500 + }, + { + "epoch": 1.5331840147225457, + "grad_norm": 7.255298137664795, + "learning_rate": 2.4446933087957573e-05, + "loss": 2.7525, + "step": 4932000 + }, + { + "epoch": 1.5333394470030326, + "grad_norm": 22.206663131713867, + "learning_rate": 2.444434254994946e-05, + "loss": 2.7905, + "step": 4932500 + }, + { + "epoch": 1.5334948792835195, + "grad_norm": 9.445939064025879, + "learning_rate": 2.4441752011941348e-05, + "loss": 2.7806, + "step": 4933000 + }, + { + "epoch": 1.5336503115640063, + "grad_norm": 7.933517932891846, + "learning_rate": 2.443916147393323e-05, + "loss": 2.7878, + "step": 4933500 + }, + { + "epoch": 1.5338057438444932, + "grad_norm": 9.765817642211914, + "learning_rate": 2.4436570935925115e-05, + "loss": 2.7635, + "step": 4934000 + }, + { + "epoch": 1.53396117612498, + "grad_norm": 10.054190635681152, + "learning_rate": 2.4433980397917e-05, + "loss": 2.729, + "step": 4934500 + }, + { + "epoch": 1.534116608405467, + "grad_norm": 8.424949645996094, + "learning_rate": 2.4431389859908886e-05, + "loss": 2.8164, + "step": 4935000 + }, + { + "epoch": 1.5342720406859538, + "grad_norm": 9.392502784729004, + "learning_rate": 2.4428799321900773e-05, + "loss": 2.7721, + "step": 4935500 + }, + { + "epoch": 1.5344274729664407, + "grad_norm": 16.69325828552246, + "learning_rate": 2.4426208783892657e-05, + "loss": 2.748, + "step": 4936000 + }, + { + "epoch": 1.5345829052469275, + "grad_norm": 8.10085678100586, + "learning_rate": 2.4423618245884544e-05, + "loss": 2.7853, + "step": 4936500 + }, + { + "epoch": 1.5347383375274144, + "grad_norm": 8.862125396728516, + "learning_rate": 2.4421027707876428e-05, + "loss": 2.7548, + "step": 4937000 + }, + { + "epoch": 1.5348937698079013, + "grad_norm": 8.191340446472168, + "learning_rate": 2.4418437169868315e-05, + "loss": 2.781, + "step": 4937500 + }, + { + "epoch": 1.5350492020883881, + "grad_norm": 7.90634822845459, + "learning_rate": 2.44158466318602e-05, + "loss": 2.7377, + "step": 4938000 + }, + { + "epoch": 1.535204634368875, + "grad_norm": 10.4783353805542, + "learning_rate": 2.4413256093852086e-05, + "loss": 2.7742, + "step": 4938500 + }, + { + "epoch": 1.5353600666493619, + "grad_norm": 7.837668418884277, + "learning_rate": 2.441066555584397e-05, + "loss": 2.8252, + "step": 4939000 + }, + { + "epoch": 1.5355154989298487, + "grad_norm": 7.251586437225342, + "learning_rate": 2.4408075017835853e-05, + "loss": 2.7809, + "step": 4939500 + }, + { + "epoch": 1.5356709312103356, + "grad_norm": 8.683730125427246, + "learning_rate": 2.440548447982774e-05, + "loss": 2.7476, + "step": 4940000 + }, + { + "epoch": 1.5358263634908225, + "grad_norm": 11.173558235168457, + "learning_rate": 2.4402893941819628e-05, + "loss": 2.7688, + "step": 4940500 + }, + { + "epoch": 1.5359817957713093, + "grad_norm": 7.6596999168396, + "learning_rate": 2.440030340381151e-05, + "loss": 2.759, + "step": 4941000 + }, + { + "epoch": 1.5361372280517962, + "grad_norm": 10.28685474395752, + "learning_rate": 2.4397712865803395e-05, + "loss": 2.7665, + "step": 4941500 + }, + { + "epoch": 1.536292660332283, + "grad_norm": 10.129429817199707, + "learning_rate": 2.4395122327795282e-05, + "loss": 2.8009, + "step": 4942000 + }, + { + "epoch": 1.53644809261277, + "grad_norm": 9.104277610778809, + "learning_rate": 2.439253178978717e-05, + "loss": 2.7808, + "step": 4942500 + }, + { + "epoch": 1.5366035248932568, + "grad_norm": 11.41936206817627, + "learning_rate": 2.4389941251779053e-05, + "loss": 2.8088, + "step": 4943000 + }, + { + "epoch": 1.5367589571737437, + "grad_norm": 11.083914756774902, + "learning_rate": 2.4387350713770937e-05, + "loss": 2.7811, + "step": 4943500 + }, + { + "epoch": 1.5369143894542305, + "grad_norm": 7.8374247550964355, + "learning_rate": 2.4384760175762824e-05, + "loss": 2.7945, + "step": 4944000 + }, + { + "epoch": 1.5370698217347174, + "grad_norm": 10.984819412231445, + "learning_rate": 2.4382169637754708e-05, + "loss": 2.7572, + "step": 4944500 + }, + { + "epoch": 1.5372252540152043, + "grad_norm": 9.201117515563965, + "learning_rate": 2.4379579099746595e-05, + "loss": 2.7347, + "step": 4945000 + }, + { + "epoch": 1.5373806862956911, + "grad_norm": 11.524507522583008, + "learning_rate": 2.4376988561738482e-05, + "loss": 2.8081, + "step": 4945500 + }, + { + "epoch": 1.537536118576178, + "grad_norm": 8.590156555175781, + "learning_rate": 2.4374398023730366e-05, + "loss": 2.7983, + "step": 4946000 + }, + { + "epoch": 1.5376915508566649, + "grad_norm": 9.704445838928223, + "learning_rate": 2.437180748572225e-05, + "loss": 2.7913, + "step": 4946500 + }, + { + "epoch": 1.5378469831371517, + "grad_norm": 11.652613639831543, + "learning_rate": 2.4369216947714134e-05, + "loss": 2.7648, + "step": 4947000 + }, + { + "epoch": 1.5380024154176386, + "grad_norm": 12.484148979187012, + "learning_rate": 2.4366626409706024e-05, + "loss": 2.8184, + "step": 4947500 + }, + { + "epoch": 1.5381578476981257, + "grad_norm": 10.157194137573242, + "learning_rate": 2.4364035871697908e-05, + "loss": 2.7797, + "step": 4948000 + }, + { + "epoch": 1.5383132799786126, + "grad_norm": 9.199995994567871, + "learning_rate": 2.436144533368979e-05, + "loss": 2.8072, + "step": 4948500 + }, + { + "epoch": 1.5384687122590994, + "grad_norm": 9.806750297546387, + "learning_rate": 2.435885479568168e-05, + "loss": 2.7588, + "step": 4949000 + }, + { + "epoch": 1.5386241445395863, + "grad_norm": 10.265429496765137, + "learning_rate": 2.4356264257673563e-05, + "loss": 2.7916, + "step": 4949500 + }, + { + "epoch": 1.5387795768200732, + "grad_norm": 8.823355674743652, + "learning_rate": 2.435367371966545e-05, + "loss": 2.7729, + "step": 4950000 + }, + { + "epoch": 1.53893500910056, + "grad_norm": 9.759532928466797, + "learning_rate": 2.4351083181657334e-05, + "loss": 2.78, + "step": 4950500 + }, + { + "epoch": 1.539090441381047, + "grad_norm": 11.283797264099121, + "learning_rate": 2.434849264364922e-05, + "loss": 2.7287, + "step": 4951000 + }, + { + "epoch": 1.5392458736615338, + "grad_norm": 9.891111373901367, + "learning_rate": 2.4345902105641104e-05, + "loss": 2.7445, + "step": 4951500 + }, + { + "epoch": 1.5394013059420206, + "grad_norm": 14.073963165283203, + "learning_rate": 2.4343311567632988e-05, + "loss": 2.7731, + "step": 4952000 + }, + { + "epoch": 1.5395567382225075, + "grad_norm": 6.785019397735596, + "learning_rate": 2.4340721029624875e-05, + "loss": 2.7261, + "step": 4952500 + }, + { + "epoch": 1.5397121705029944, + "grad_norm": 9.036152839660645, + "learning_rate": 2.4338130491616763e-05, + "loss": 2.7236, + "step": 4953000 + }, + { + "epoch": 1.5398676027834812, + "grad_norm": 11.045331001281738, + "learning_rate": 2.4335539953608646e-05, + "loss": 2.7779, + "step": 4953500 + }, + { + "epoch": 1.5400230350639683, + "grad_norm": 12.920889854431152, + "learning_rate": 2.433294941560053e-05, + "loss": 2.81, + "step": 4954000 + }, + { + "epoch": 1.5401784673444552, + "grad_norm": 12.143770217895508, + "learning_rate": 2.4330358877592417e-05, + "loss": 2.7232, + "step": 4954500 + }, + { + "epoch": 1.540333899624942, + "grad_norm": 11.058176040649414, + "learning_rate": 2.4327768339584304e-05, + "loss": 2.7913, + "step": 4955000 + }, + { + "epoch": 1.540489331905429, + "grad_norm": 8.710577011108398, + "learning_rate": 2.4325177801576188e-05, + "loss": 2.778, + "step": 4955500 + }, + { + "epoch": 1.5406447641859158, + "grad_norm": 16.872156143188477, + "learning_rate": 2.4322587263568072e-05, + "loss": 2.8484, + "step": 4956000 + }, + { + "epoch": 1.5408001964664026, + "grad_norm": 8.641480445861816, + "learning_rate": 2.431999672555996e-05, + "loss": 2.791, + "step": 4956500 + }, + { + "epoch": 1.5409556287468895, + "grad_norm": 11.766640663146973, + "learning_rate": 2.4317406187551843e-05, + "loss": 2.7202, + "step": 4957000 + }, + { + "epoch": 1.5411110610273764, + "grad_norm": 12.81679916381836, + "learning_rate": 2.431481564954373e-05, + "loss": 2.7877, + "step": 4957500 + }, + { + "epoch": 1.5412664933078633, + "grad_norm": 8.432589530944824, + "learning_rate": 2.4312225111535617e-05, + "loss": 2.7928, + "step": 4958000 + }, + { + "epoch": 1.5414219255883501, + "grad_norm": 28.883764266967773, + "learning_rate": 2.43096345735275e-05, + "loss": 2.7745, + "step": 4958500 + }, + { + "epoch": 1.541577357868837, + "grad_norm": 11.702127456665039, + "learning_rate": 2.4307044035519385e-05, + "loss": 2.7612, + "step": 4959000 + }, + { + "epoch": 1.5417327901493239, + "grad_norm": 9.138216018676758, + "learning_rate": 2.4304453497511272e-05, + "loss": 2.7868, + "step": 4959500 + }, + { + "epoch": 1.5418882224298107, + "grad_norm": 14.706995964050293, + "learning_rate": 2.430186295950316e-05, + "loss": 2.8354, + "step": 4960000 + }, + { + "epoch": 1.5420436547102976, + "grad_norm": 14.480578422546387, + "learning_rate": 2.4299272421495043e-05, + "loss": 2.7586, + "step": 4960500 + }, + { + "epoch": 1.5421990869907845, + "grad_norm": 9.883678436279297, + "learning_rate": 2.4296681883486926e-05, + "loss": 2.7767, + "step": 4961000 + }, + { + "epoch": 1.5423545192712713, + "grad_norm": 7.754658222198486, + "learning_rate": 2.429409134547881e-05, + "loss": 2.8061, + "step": 4961500 + }, + { + "epoch": 1.5425099515517582, + "grad_norm": 7.7805352210998535, + "learning_rate": 2.4291500807470697e-05, + "loss": 2.7959, + "step": 4962000 + }, + { + "epoch": 1.542665383832245, + "grad_norm": 9.285102844238281, + "learning_rate": 2.4288910269462585e-05, + "loss": 2.8093, + "step": 4962500 + }, + { + "epoch": 1.542820816112732, + "grad_norm": 8.390935897827148, + "learning_rate": 2.428631973145447e-05, + "loss": 2.7945, + "step": 4963000 + }, + { + "epoch": 1.5429762483932188, + "grad_norm": 15.977946281433105, + "learning_rate": 2.4283729193446355e-05, + "loss": 2.7662, + "step": 4963500 + }, + { + "epoch": 1.5431316806737057, + "grad_norm": 13.139912605285645, + "learning_rate": 2.428113865543824e-05, + "loss": 2.8039, + "step": 4964000 + }, + { + "epoch": 1.5432871129541925, + "grad_norm": 30.72053337097168, + "learning_rate": 2.4278548117430126e-05, + "loss": 2.7823, + "step": 4964500 + }, + { + "epoch": 1.5434425452346794, + "grad_norm": 8.658971786499023, + "learning_rate": 2.427595757942201e-05, + "loss": 2.7852, + "step": 4965000 + }, + { + "epoch": 1.5435979775151663, + "grad_norm": 9.469697952270508, + "learning_rate": 2.4273367041413897e-05, + "loss": 2.7709, + "step": 4965500 + }, + { + "epoch": 1.5437534097956531, + "grad_norm": 10.266731262207031, + "learning_rate": 2.427077650340578e-05, + "loss": 2.7776, + "step": 4966000 + }, + { + "epoch": 1.54390884207614, + "grad_norm": 17.100074768066406, + "learning_rate": 2.4268185965397665e-05, + "loss": 2.7685, + "step": 4966500 + }, + { + "epoch": 1.5440642743566269, + "grad_norm": 8.764692306518555, + "learning_rate": 2.4265595427389552e-05, + "loss": 2.7727, + "step": 4967000 + }, + { + "epoch": 1.5442197066371137, + "grad_norm": 17.278738021850586, + "learning_rate": 2.426300488938144e-05, + "loss": 2.7718, + "step": 4967500 + }, + { + "epoch": 1.5443751389176006, + "grad_norm": 11.429961204528809, + "learning_rate": 2.4260414351373323e-05, + "loss": 2.7602, + "step": 4968000 + }, + { + "epoch": 1.5445305711980875, + "grad_norm": 10.584446907043457, + "learning_rate": 2.4257823813365207e-05, + "loss": 2.7766, + "step": 4968500 + }, + { + "epoch": 1.5446860034785743, + "grad_norm": 24.51889991760254, + "learning_rate": 2.4255233275357094e-05, + "loss": 2.8034, + "step": 4969000 + }, + { + "epoch": 1.5448414357590612, + "grad_norm": 8.399270057678223, + "learning_rate": 2.425264273734898e-05, + "loss": 2.7717, + "step": 4969500 + }, + { + "epoch": 1.544996868039548, + "grad_norm": 8.247196197509766, + "learning_rate": 2.4250052199340865e-05, + "loss": 2.7549, + "step": 4970000 + }, + { + "epoch": 1.545152300320035, + "grad_norm": 9.456645965576172, + "learning_rate": 2.424746166133275e-05, + "loss": 2.7536, + "step": 4970500 + }, + { + "epoch": 1.5453077326005218, + "grad_norm": 9.277894020080566, + "learning_rate": 2.4244871123324636e-05, + "loss": 2.7688, + "step": 4971000 + }, + { + "epoch": 1.5454631648810087, + "grad_norm": 10.324369430541992, + "learning_rate": 2.424228058531652e-05, + "loss": 2.7577, + "step": 4971500 + }, + { + "epoch": 1.5456185971614957, + "grad_norm": 10.543966293334961, + "learning_rate": 2.4239690047308407e-05, + "loss": 2.7904, + "step": 4972000 + }, + { + "epoch": 1.5457740294419826, + "grad_norm": 10.42240047454834, + "learning_rate": 2.4237099509300294e-05, + "loss": 2.7548, + "step": 4972500 + }, + { + "epoch": 1.5459294617224695, + "grad_norm": 9.190238952636719, + "learning_rate": 2.4234508971292178e-05, + "loss": 2.7592, + "step": 4973000 + }, + { + "epoch": 1.5460848940029563, + "grad_norm": 10.26937198638916, + "learning_rate": 2.423191843328406e-05, + "loss": 2.8028, + "step": 4973500 + }, + { + "epoch": 1.5462403262834432, + "grad_norm": 9.307356834411621, + "learning_rate": 2.4229327895275945e-05, + "loss": 2.8027, + "step": 4974000 + }, + { + "epoch": 1.54639575856393, + "grad_norm": 8.90548324584961, + "learning_rate": 2.4226737357267836e-05, + "loss": 2.7846, + "step": 4974500 + }, + { + "epoch": 1.546551190844417, + "grad_norm": 11.534445762634277, + "learning_rate": 2.422414681925972e-05, + "loss": 2.7668, + "step": 4975000 + }, + { + "epoch": 1.5467066231249038, + "grad_norm": 21.9250545501709, + "learning_rate": 2.4221556281251603e-05, + "loss": 2.77, + "step": 4975500 + }, + { + "epoch": 1.5468620554053907, + "grad_norm": 11.467473030090332, + "learning_rate": 2.421896574324349e-05, + "loss": 2.7835, + "step": 4976000 + }, + { + "epoch": 1.5470174876858775, + "grad_norm": 13.60908031463623, + "learning_rate": 2.4216375205235374e-05, + "loss": 2.8101, + "step": 4976500 + }, + { + "epoch": 1.5471729199663644, + "grad_norm": 11.233394622802734, + "learning_rate": 2.421378466722726e-05, + "loss": 2.7915, + "step": 4977000 + }, + { + "epoch": 1.5473283522468513, + "grad_norm": 8.332118034362793, + "learning_rate": 2.4211194129219145e-05, + "loss": 2.8374, + "step": 4977500 + }, + { + "epoch": 1.5474837845273384, + "grad_norm": 12.434977531433105, + "learning_rate": 2.4208603591211032e-05, + "loss": 2.7824, + "step": 4978000 + }, + { + "epoch": 1.5476392168078252, + "grad_norm": 8.765534400939941, + "learning_rate": 2.4206013053202916e-05, + "loss": 2.7826, + "step": 4978500 + }, + { + "epoch": 1.547794649088312, + "grad_norm": 8.479438781738281, + "learning_rate": 2.42034225151948e-05, + "loss": 2.8005, + "step": 4979000 + }, + { + "epoch": 1.547950081368799, + "grad_norm": 17.83700180053711, + "learning_rate": 2.4200831977186687e-05, + "loss": 2.7595, + "step": 4979500 + }, + { + "epoch": 1.5481055136492858, + "grad_norm": 8.552816390991211, + "learning_rate": 2.4198241439178574e-05, + "loss": 2.7595, + "step": 4980000 + }, + { + "epoch": 1.5482609459297727, + "grad_norm": 8.282855987548828, + "learning_rate": 2.4195650901170458e-05, + "loss": 2.7867, + "step": 4980500 + }, + { + "epoch": 1.5484163782102596, + "grad_norm": 6.861931324005127, + "learning_rate": 2.419306036316234e-05, + "loss": 2.7693, + "step": 4981000 + }, + { + "epoch": 1.5485718104907464, + "grad_norm": 9.192740440368652, + "learning_rate": 2.419046982515423e-05, + "loss": 2.7537, + "step": 4981500 + }, + { + "epoch": 1.5487272427712333, + "grad_norm": 34.1582145690918, + "learning_rate": 2.4187879287146116e-05, + "loss": 2.7786, + "step": 4982000 + }, + { + "epoch": 1.5488826750517202, + "grad_norm": 10.805374145507812, + "learning_rate": 2.4185288749138e-05, + "loss": 2.742, + "step": 4982500 + }, + { + "epoch": 1.549038107332207, + "grad_norm": 15.174613952636719, + "learning_rate": 2.4182698211129883e-05, + "loss": 2.7733, + "step": 4983000 + }, + { + "epoch": 1.549193539612694, + "grad_norm": 7.433111190795898, + "learning_rate": 2.418010767312177e-05, + "loss": 2.7891, + "step": 4983500 + }, + { + "epoch": 1.5493489718931808, + "grad_norm": 9.31905460357666, + "learning_rate": 2.4177517135113654e-05, + "loss": 2.7469, + "step": 4984000 + }, + { + "epoch": 1.5495044041736676, + "grad_norm": 9.918177604675293, + "learning_rate": 2.417492659710554e-05, + "loss": 2.729, + "step": 4984500 + }, + { + "epoch": 1.5496598364541545, + "grad_norm": 9.982852935791016, + "learning_rate": 2.417233605909743e-05, + "loss": 2.7821, + "step": 4985000 + }, + { + "epoch": 1.5498152687346414, + "grad_norm": 9.908548355102539, + "learning_rate": 2.4169745521089312e-05, + "loss": 2.7694, + "step": 4985500 + }, + { + "epoch": 1.5499707010151282, + "grad_norm": 8.718083381652832, + "learning_rate": 2.4167154983081196e-05, + "loss": 2.7244, + "step": 4986000 + }, + { + "epoch": 1.550126133295615, + "grad_norm": 10.624916076660156, + "learning_rate": 2.4164564445073083e-05, + "loss": 2.7529, + "step": 4986500 + }, + { + "epoch": 1.550281565576102, + "grad_norm": 10.297810554504395, + "learning_rate": 2.416197390706497e-05, + "loss": 2.7639, + "step": 4987000 + }, + { + "epoch": 1.5504369978565888, + "grad_norm": 9.091371536254883, + "learning_rate": 2.4159383369056854e-05, + "loss": 2.8203, + "step": 4987500 + }, + { + "epoch": 1.5505924301370757, + "grad_norm": 8.105730056762695, + "learning_rate": 2.4156792831048738e-05, + "loss": 2.7499, + "step": 4988000 + }, + { + "epoch": 1.5507478624175626, + "grad_norm": 9.796317100524902, + "learning_rate": 2.415420229304062e-05, + "loss": 2.7784, + "step": 4988500 + }, + { + "epoch": 1.5509032946980494, + "grad_norm": 6.699175834655762, + "learning_rate": 2.415161175503251e-05, + "loss": 2.7658, + "step": 4989000 + }, + { + "epoch": 1.5510587269785363, + "grad_norm": 9.560373306274414, + "learning_rate": 2.4149021217024396e-05, + "loss": 2.7686, + "step": 4989500 + }, + { + "epoch": 1.5512141592590232, + "grad_norm": 10.442084312438965, + "learning_rate": 2.414643067901628e-05, + "loss": 2.8029, + "step": 4990000 + }, + { + "epoch": 1.55136959153951, + "grad_norm": 11.138956069946289, + "learning_rate": 2.4143840141008167e-05, + "loss": 2.7781, + "step": 4990500 + }, + { + "epoch": 1.551525023819997, + "grad_norm": 18.018537521362305, + "learning_rate": 2.414124960300005e-05, + "loss": 2.7719, + "step": 4991000 + }, + { + "epoch": 1.5516804561004838, + "grad_norm": 13.052005767822266, + "learning_rate": 2.4138659064991938e-05, + "loss": 2.8143, + "step": 4991500 + }, + { + "epoch": 1.5518358883809706, + "grad_norm": 42.97817611694336, + "learning_rate": 2.413606852698382e-05, + "loss": 2.7877, + "step": 4992000 + }, + { + "epoch": 1.5519913206614575, + "grad_norm": 9.035807609558105, + "learning_rate": 2.413347798897571e-05, + "loss": 2.7683, + "step": 4992500 + }, + { + "epoch": 1.5521467529419444, + "grad_norm": 10.058037757873535, + "learning_rate": 2.4130887450967592e-05, + "loss": 2.7432, + "step": 4993000 + }, + { + "epoch": 1.5523021852224312, + "grad_norm": 8.130960464477539, + "learning_rate": 2.4128296912959476e-05, + "loss": 2.7854, + "step": 4993500 + }, + { + "epoch": 1.552457617502918, + "grad_norm": 10.253935813903809, + "learning_rate": 2.4125706374951363e-05, + "loss": 2.7313, + "step": 4994000 + }, + { + "epoch": 1.552613049783405, + "grad_norm": 9.132733345031738, + "learning_rate": 2.412311583694325e-05, + "loss": 2.7639, + "step": 4994500 + }, + { + "epoch": 1.5527684820638918, + "grad_norm": 10.970513343811035, + "learning_rate": 2.4120525298935134e-05, + "loss": 2.7371, + "step": 4995000 + }, + { + "epoch": 1.5529239143443787, + "grad_norm": 7.888111114501953, + "learning_rate": 2.4117934760927018e-05, + "loss": 2.7799, + "step": 4995500 + }, + { + "epoch": 1.5530793466248658, + "grad_norm": 10.8418607711792, + "learning_rate": 2.4115344222918905e-05, + "loss": 2.7819, + "step": 4996000 + }, + { + "epoch": 1.5532347789053527, + "grad_norm": 8.5740327835083, + "learning_rate": 2.4112753684910792e-05, + "loss": 2.7711, + "step": 4996500 + }, + { + "epoch": 1.5533902111858395, + "grad_norm": 10.04001522064209, + "learning_rate": 2.4110163146902676e-05, + "loss": 2.7255, + "step": 4997000 + }, + { + "epoch": 1.5535456434663264, + "grad_norm": 9.691691398620605, + "learning_rate": 2.410757260889456e-05, + "loss": 2.7479, + "step": 4997500 + }, + { + "epoch": 1.5537010757468133, + "grad_norm": 11.272490501403809, + "learning_rate": 2.4104982070886447e-05, + "loss": 2.7557, + "step": 4998000 + }, + { + "epoch": 1.5538565080273001, + "grad_norm": 9.287012100219727, + "learning_rate": 2.410239153287833e-05, + "loss": 2.7583, + "step": 4998500 + }, + { + "epoch": 1.554011940307787, + "grad_norm": 8.74532413482666, + "learning_rate": 2.4099800994870218e-05, + "loss": 2.7938, + "step": 4999000 + }, + { + "epoch": 1.5541673725882739, + "grad_norm": 14.931551933288574, + "learning_rate": 2.4097210456862105e-05, + "loss": 2.75, + "step": 4999500 + }, + { + "epoch": 1.5543228048687607, + "grad_norm": 8.511895179748535, + "learning_rate": 2.409461991885399e-05, + "loss": 2.7942, + "step": 5000000 + }, + { + "epoch": 1.5544782371492476, + "grad_norm": 10.678973197937012, + "learning_rate": 2.4092029380845873e-05, + "loss": 2.7916, + "step": 5000500 + }, + { + "epoch": 1.5546336694297345, + "grad_norm": 8.095836639404297, + "learning_rate": 2.408943884283776e-05, + "loss": 2.7384, + "step": 5001000 + }, + { + "epoch": 1.5547891017102213, + "grad_norm": 8.678053855895996, + "learning_rate": 2.4086848304829647e-05, + "loss": 2.7685, + "step": 5001500 + }, + { + "epoch": 1.5549445339907084, + "grad_norm": 11.933993339538574, + "learning_rate": 2.408425776682153e-05, + "loss": 2.8016, + "step": 5002000 + }, + { + "epoch": 1.5550999662711953, + "grad_norm": 8.554154396057129, + "learning_rate": 2.4081667228813415e-05, + "loss": 2.7879, + "step": 5002500 + }, + { + "epoch": 1.5552553985516822, + "grad_norm": 11.102785110473633, + "learning_rate": 2.40790766908053e-05, + "loss": 2.8108, + "step": 5003000 + }, + { + "epoch": 1.555410830832169, + "grad_norm": 9.706934928894043, + "learning_rate": 2.4076486152797185e-05, + "loss": 2.8015, + "step": 5003500 + }, + { + "epoch": 1.555566263112656, + "grad_norm": 9.095174789428711, + "learning_rate": 2.4073895614789073e-05, + "loss": 2.7881, + "step": 5004000 + }, + { + "epoch": 1.5557216953931428, + "grad_norm": 49.65503692626953, + "learning_rate": 2.4071305076780956e-05, + "loss": 2.7539, + "step": 5004500 + }, + { + "epoch": 1.5558771276736296, + "grad_norm": 12.077136039733887, + "learning_rate": 2.4068714538772844e-05, + "loss": 2.8, + "step": 5005000 + }, + { + "epoch": 1.5560325599541165, + "grad_norm": 6.521296501159668, + "learning_rate": 2.4066124000764727e-05, + "loss": 2.7863, + "step": 5005500 + }, + { + "epoch": 1.5561879922346034, + "grad_norm": 8.629199028015137, + "learning_rate": 2.4063533462756614e-05, + "loss": 2.7455, + "step": 5006000 + }, + { + "epoch": 1.5563434245150902, + "grad_norm": 8.90859603881836, + "learning_rate": 2.4060942924748498e-05, + "loss": 2.7255, + "step": 5006500 + }, + { + "epoch": 1.556498856795577, + "grad_norm": 10.544561386108398, + "learning_rate": 2.4058352386740385e-05, + "loss": 2.7665, + "step": 5007000 + }, + { + "epoch": 1.556654289076064, + "grad_norm": 7.792097091674805, + "learning_rate": 2.405576184873227e-05, + "loss": 2.7785, + "step": 5007500 + }, + { + "epoch": 1.5568097213565508, + "grad_norm": 7.706377029418945, + "learning_rate": 2.4053171310724153e-05, + "loss": 2.7498, + "step": 5008000 + }, + { + "epoch": 1.5569651536370377, + "grad_norm": 18.528377532958984, + "learning_rate": 2.405058077271604e-05, + "loss": 2.8039, + "step": 5008500 + }, + { + "epoch": 1.5571205859175246, + "grad_norm": 8.875295639038086, + "learning_rate": 2.4047990234707927e-05, + "loss": 2.7516, + "step": 5009000 + }, + { + "epoch": 1.5572760181980114, + "grad_norm": 11.668190002441406, + "learning_rate": 2.404539969669981e-05, + "loss": 2.74, + "step": 5009500 + }, + { + "epoch": 1.5574314504784983, + "grad_norm": 9.057136535644531, + "learning_rate": 2.4042809158691695e-05, + "loss": 2.7869, + "step": 5010000 + }, + { + "epoch": 1.5575868827589852, + "grad_norm": 8.8588285446167, + "learning_rate": 2.4040218620683582e-05, + "loss": 2.8139, + "step": 5010500 + }, + { + "epoch": 1.557742315039472, + "grad_norm": 8.658839225769043, + "learning_rate": 2.403762808267547e-05, + "loss": 2.7834, + "step": 5011000 + }, + { + "epoch": 1.557897747319959, + "grad_norm": 10.572925567626953, + "learning_rate": 2.4035037544667353e-05, + "loss": 2.7868, + "step": 5011500 + }, + { + "epoch": 1.5580531796004458, + "grad_norm": 6.7250871658325195, + "learning_rate": 2.403244700665924e-05, + "loss": 2.801, + "step": 5012000 + }, + { + "epoch": 1.5582086118809326, + "grad_norm": 9.818016052246094, + "learning_rate": 2.4029856468651124e-05, + "loss": 2.7297, + "step": 5012500 + }, + { + "epoch": 1.5583640441614195, + "grad_norm": 12.477269172668457, + "learning_rate": 2.4027265930643007e-05, + "loss": 2.7416, + "step": 5013000 + }, + { + "epoch": 1.5585194764419064, + "grad_norm": 18.74561309814453, + "learning_rate": 2.4024675392634895e-05, + "loss": 2.7902, + "step": 5013500 + }, + { + "epoch": 1.5586749087223932, + "grad_norm": 16.175037384033203, + "learning_rate": 2.4022084854626782e-05, + "loss": 2.8035, + "step": 5014000 + }, + { + "epoch": 1.55883034100288, + "grad_norm": 8.13518238067627, + "learning_rate": 2.4019494316618666e-05, + "loss": 2.7665, + "step": 5014500 + }, + { + "epoch": 1.558985773283367, + "grad_norm": 15.758527755737305, + "learning_rate": 2.401690377861055e-05, + "loss": 2.8071, + "step": 5015000 + }, + { + "epoch": 1.5591412055638538, + "grad_norm": 10.276273727416992, + "learning_rate": 2.4014313240602433e-05, + "loss": 2.7875, + "step": 5015500 + }, + { + "epoch": 1.5592966378443407, + "grad_norm": 8.159103393554688, + "learning_rate": 2.4011722702594324e-05, + "loss": 2.816, + "step": 5016000 + }, + { + "epoch": 1.5594520701248276, + "grad_norm": 8.957876205444336, + "learning_rate": 2.4009132164586207e-05, + "loss": 2.7617, + "step": 5016500 + }, + { + "epoch": 1.5596075024053144, + "grad_norm": 10.422518730163574, + "learning_rate": 2.400654162657809e-05, + "loss": 2.7569, + "step": 5017000 + }, + { + "epoch": 1.5597629346858013, + "grad_norm": 13.810672760009766, + "learning_rate": 2.4003951088569978e-05, + "loss": 2.7259, + "step": 5017500 + }, + { + "epoch": 1.5599183669662882, + "grad_norm": 9.502352714538574, + "learning_rate": 2.4001360550561862e-05, + "loss": 2.7792, + "step": 5018000 + }, + { + "epoch": 1.560073799246775, + "grad_norm": 10.353432655334473, + "learning_rate": 2.399877001255375e-05, + "loss": 2.7731, + "step": 5018500 + }, + { + "epoch": 1.560229231527262, + "grad_norm": 12.842019081115723, + "learning_rate": 2.3996179474545633e-05, + "loss": 2.7339, + "step": 5019000 + }, + { + "epoch": 1.5603846638077488, + "grad_norm": 9.421480178833008, + "learning_rate": 2.399358893653752e-05, + "loss": 2.7472, + "step": 5019500 + }, + { + "epoch": 1.5605400960882359, + "grad_norm": 11.60014533996582, + "learning_rate": 2.3990998398529404e-05, + "loss": 2.7473, + "step": 5020000 + }, + { + "epoch": 1.5606955283687227, + "grad_norm": 16.029800415039062, + "learning_rate": 2.3988407860521288e-05, + "loss": 2.7941, + "step": 5020500 + }, + { + "epoch": 1.5608509606492096, + "grad_norm": 10.976276397705078, + "learning_rate": 2.3985817322513178e-05, + "loss": 2.7571, + "step": 5021000 + }, + { + "epoch": 1.5610063929296965, + "grad_norm": 12.966867446899414, + "learning_rate": 2.3983226784505062e-05, + "loss": 2.7898, + "step": 5021500 + }, + { + "epoch": 1.5611618252101833, + "grad_norm": 8.383177757263184, + "learning_rate": 2.3980636246496946e-05, + "loss": 2.7743, + "step": 5022000 + }, + { + "epoch": 1.5613172574906702, + "grad_norm": 8.820206642150879, + "learning_rate": 2.397804570848883e-05, + "loss": 2.8046, + "step": 5022500 + }, + { + "epoch": 1.561472689771157, + "grad_norm": 16.839235305786133, + "learning_rate": 2.3975455170480717e-05, + "loss": 2.7556, + "step": 5023000 + }, + { + "epoch": 1.561628122051644, + "grad_norm": 8.48927116394043, + "learning_rate": 2.3972864632472604e-05, + "loss": 2.7284, + "step": 5023500 + }, + { + "epoch": 1.5617835543321308, + "grad_norm": 10.045921325683594, + "learning_rate": 2.3970274094464488e-05, + "loss": 2.74, + "step": 5024000 + }, + { + "epoch": 1.5619389866126177, + "grad_norm": 10.499956130981445, + "learning_rate": 2.396768355645637e-05, + "loss": 2.7836, + "step": 5024500 + }, + { + "epoch": 1.5620944188931045, + "grad_norm": 9.293780326843262, + "learning_rate": 2.396509301844826e-05, + "loss": 2.7909, + "step": 5025000 + }, + { + "epoch": 1.5622498511735914, + "grad_norm": 16.192766189575195, + "learning_rate": 2.3962502480440142e-05, + "loss": 2.7487, + "step": 5025500 + }, + { + "epoch": 1.5624052834540785, + "grad_norm": 14.942630767822266, + "learning_rate": 2.395991194243203e-05, + "loss": 2.7313, + "step": 5026000 + }, + { + "epoch": 1.5625607157345653, + "grad_norm": 12.711578369140625, + "learning_rate": 2.3957321404423917e-05, + "loss": 2.7901, + "step": 5026500 + }, + { + "epoch": 1.5627161480150522, + "grad_norm": 10.107441902160645, + "learning_rate": 2.39547308664158e-05, + "loss": 2.8196, + "step": 5027000 + }, + { + "epoch": 1.562871580295539, + "grad_norm": 8.431321144104004, + "learning_rate": 2.3952140328407684e-05, + "loss": 2.7261, + "step": 5027500 + }, + { + "epoch": 1.563027012576026, + "grad_norm": 18.6417293548584, + "learning_rate": 2.394954979039957e-05, + "loss": 2.761, + "step": 5028000 + }, + { + "epoch": 1.5631824448565128, + "grad_norm": 8.27269172668457, + "learning_rate": 2.394695925239146e-05, + "loss": 2.7561, + "step": 5028500 + }, + { + "epoch": 1.5633378771369997, + "grad_norm": 10.10798168182373, + "learning_rate": 2.3944368714383342e-05, + "loss": 2.7384, + "step": 5029000 + }, + { + "epoch": 1.5634933094174865, + "grad_norm": 8.169349670410156, + "learning_rate": 2.3941778176375226e-05, + "loss": 2.7583, + "step": 5029500 + }, + { + "epoch": 1.5636487416979734, + "grad_norm": 9.154059410095215, + "learning_rate": 2.3939187638367113e-05, + "loss": 2.766, + "step": 5030000 + }, + { + "epoch": 1.5638041739784603, + "grad_norm": 8.481191635131836, + "learning_rate": 2.3936597100358997e-05, + "loss": 2.7444, + "step": 5030500 + }, + { + "epoch": 1.5639596062589471, + "grad_norm": 11.26380729675293, + "learning_rate": 2.3934006562350884e-05, + "loss": 2.7578, + "step": 5031000 + }, + { + "epoch": 1.564115038539434, + "grad_norm": 14.7928466796875, + "learning_rate": 2.3931416024342768e-05, + "loss": 2.7479, + "step": 5031500 + }, + { + "epoch": 1.5642704708199209, + "grad_norm": 10.182064056396484, + "learning_rate": 2.3928825486334655e-05, + "loss": 2.7539, + "step": 5032000 + }, + { + "epoch": 1.5644259031004077, + "grad_norm": 9.550956726074219, + "learning_rate": 2.392623494832654e-05, + "loss": 2.7647, + "step": 5032500 + }, + { + "epoch": 1.5645813353808946, + "grad_norm": 15.534473419189453, + "learning_rate": 2.3923644410318426e-05, + "loss": 2.7546, + "step": 5033000 + }, + { + "epoch": 1.5647367676613815, + "grad_norm": 17.987239837646484, + "learning_rate": 2.392105387231031e-05, + "loss": 2.777, + "step": 5033500 + }, + { + "epoch": 1.5648921999418683, + "grad_norm": 8.514985084533691, + "learning_rate": 2.3918463334302197e-05, + "loss": 2.7835, + "step": 5034000 + }, + { + "epoch": 1.5650476322223552, + "grad_norm": 15.319284439086914, + "learning_rate": 2.391587279629408e-05, + "loss": 2.7608, + "step": 5034500 + }, + { + "epoch": 1.565203064502842, + "grad_norm": 12.937529563903809, + "learning_rate": 2.3913282258285964e-05, + "loss": 2.7651, + "step": 5035000 + }, + { + "epoch": 1.565358496783329, + "grad_norm": 8.4225435256958, + "learning_rate": 2.391069172027785e-05, + "loss": 2.7299, + "step": 5035500 + }, + { + "epoch": 1.5655139290638158, + "grad_norm": 9.326587677001953, + "learning_rate": 2.390810118226974e-05, + "loss": 2.7904, + "step": 5036000 + }, + { + "epoch": 1.5656693613443027, + "grad_norm": 10.8521146774292, + "learning_rate": 2.3905510644261622e-05, + "loss": 2.7927, + "step": 5036500 + }, + { + "epoch": 1.5658247936247895, + "grad_norm": 10.071260452270508, + "learning_rate": 2.3902920106253506e-05, + "loss": 2.7954, + "step": 5037000 + }, + { + "epoch": 1.5659802259052764, + "grad_norm": 12.726395606994629, + "learning_rate": 2.3900329568245393e-05, + "loss": 2.7902, + "step": 5037500 + }, + { + "epoch": 1.5661356581857633, + "grad_norm": 6.987044334411621, + "learning_rate": 2.389773903023728e-05, + "loss": 2.7568, + "step": 5038000 + }, + { + "epoch": 1.5662910904662501, + "grad_norm": 9.003243446350098, + "learning_rate": 2.3895148492229164e-05, + "loss": 2.7189, + "step": 5038500 + }, + { + "epoch": 1.566446522746737, + "grad_norm": 11.971965789794922, + "learning_rate": 2.389255795422105e-05, + "loss": 2.8013, + "step": 5039000 + }, + { + "epoch": 1.5666019550272239, + "grad_norm": 8.859938621520996, + "learning_rate": 2.3889967416212935e-05, + "loss": 2.7902, + "step": 5039500 + }, + { + "epoch": 1.5667573873077107, + "grad_norm": 9.982629776000977, + "learning_rate": 2.388737687820482e-05, + "loss": 2.7614, + "step": 5040000 + }, + { + "epoch": 1.5669128195881976, + "grad_norm": 26.981151580810547, + "learning_rate": 2.3884786340196706e-05, + "loss": 2.7187, + "step": 5040500 + }, + { + "epoch": 1.5670682518686845, + "grad_norm": 9.77907943725586, + "learning_rate": 2.3882195802188593e-05, + "loss": 2.7611, + "step": 5041000 + }, + { + "epoch": 1.5672236841491713, + "grad_norm": 10.472597122192383, + "learning_rate": 2.3879605264180477e-05, + "loss": 2.7863, + "step": 5041500 + }, + { + "epoch": 1.5673791164296582, + "grad_norm": 9.984049797058105, + "learning_rate": 2.387701472617236e-05, + "loss": 2.7996, + "step": 5042000 + }, + { + "epoch": 1.567534548710145, + "grad_norm": 10.322897911071777, + "learning_rate": 2.3874424188164244e-05, + "loss": 2.7539, + "step": 5042500 + }, + { + "epoch": 1.567689980990632, + "grad_norm": 8.210405349731445, + "learning_rate": 2.3871833650156135e-05, + "loss": 2.7548, + "step": 5043000 + }, + { + "epoch": 1.5678454132711188, + "grad_norm": 41.03724670410156, + "learning_rate": 2.386924311214802e-05, + "loss": 2.7617, + "step": 5043500 + }, + { + "epoch": 1.568000845551606, + "grad_norm": 7.129809856414795, + "learning_rate": 2.3866652574139903e-05, + "loss": 2.7748, + "step": 5044000 + }, + { + "epoch": 1.5681562778320928, + "grad_norm": 16.55553436279297, + "learning_rate": 2.386406203613179e-05, + "loss": 2.8053, + "step": 5044500 + }, + { + "epoch": 1.5683117101125796, + "grad_norm": 9.386181831359863, + "learning_rate": 2.3861471498123673e-05, + "loss": 2.74, + "step": 5045000 + }, + { + "epoch": 1.5684671423930665, + "grad_norm": 12.855425834655762, + "learning_rate": 2.385888096011556e-05, + "loss": 2.7913, + "step": 5045500 + }, + { + "epoch": 1.5686225746735534, + "grad_norm": 9.243890762329102, + "learning_rate": 2.3856290422107444e-05, + "loss": 2.7542, + "step": 5046000 + }, + { + "epoch": 1.5687780069540402, + "grad_norm": 10.996024131774902, + "learning_rate": 2.385369988409933e-05, + "loss": 2.7619, + "step": 5046500 + }, + { + "epoch": 1.568933439234527, + "grad_norm": 10.07788372039795, + "learning_rate": 2.3851109346091215e-05, + "loss": 2.7442, + "step": 5047000 + }, + { + "epoch": 1.569088871515014, + "grad_norm": 17.06034278869629, + "learning_rate": 2.38485188080831e-05, + "loss": 2.7295, + "step": 5047500 + }, + { + "epoch": 1.5692443037955008, + "grad_norm": 16.431671142578125, + "learning_rate": 2.384592827007499e-05, + "loss": 2.7425, + "step": 5048000 + }, + { + "epoch": 1.5693997360759877, + "grad_norm": 11.267928123474121, + "learning_rate": 2.3843337732066873e-05, + "loss": 2.8082, + "step": 5048500 + }, + { + "epoch": 1.5695551683564746, + "grad_norm": 9.486320495605469, + "learning_rate": 2.3840747194058757e-05, + "loss": 2.7702, + "step": 5049000 + }, + { + "epoch": 1.5697106006369614, + "grad_norm": 15.675990104675293, + "learning_rate": 2.383815665605064e-05, + "loss": 2.7585, + "step": 5049500 + }, + { + "epoch": 1.5698660329174483, + "grad_norm": 10.371773719787598, + "learning_rate": 2.3835566118042528e-05, + "loss": 2.7482, + "step": 5050000 + }, + { + "epoch": 1.5700214651979354, + "grad_norm": 7.660102844238281, + "learning_rate": 2.3832975580034415e-05, + "loss": 2.8038, + "step": 5050500 + }, + { + "epoch": 1.5701768974784223, + "grad_norm": 11.227189064025879, + "learning_rate": 2.38303850420263e-05, + "loss": 2.7821, + "step": 5051000 + }, + { + "epoch": 1.5703323297589091, + "grad_norm": 7.487915515899658, + "learning_rate": 2.3827794504018186e-05, + "loss": 2.7711, + "step": 5051500 + }, + { + "epoch": 1.570487762039396, + "grad_norm": 8.585990905761719, + "learning_rate": 2.382520396601007e-05, + "loss": 2.7905, + "step": 5052000 + }, + { + "epoch": 1.5706431943198829, + "grad_norm": 9.463282585144043, + "learning_rate": 2.3822613428001954e-05, + "loss": 2.7882, + "step": 5052500 + }, + { + "epoch": 1.5707986266003697, + "grad_norm": 7.920184135437012, + "learning_rate": 2.382002288999384e-05, + "loss": 2.7814, + "step": 5053000 + }, + { + "epoch": 1.5709540588808566, + "grad_norm": 6.346567630767822, + "learning_rate": 2.3817432351985728e-05, + "loss": 2.7585, + "step": 5053500 + }, + { + "epoch": 1.5711094911613435, + "grad_norm": 11.156027793884277, + "learning_rate": 2.3814841813977612e-05, + "loss": 2.8099, + "step": 5054000 + }, + { + "epoch": 1.5712649234418303, + "grad_norm": 11.2691011428833, + "learning_rate": 2.3812251275969496e-05, + "loss": 2.7461, + "step": 5054500 + }, + { + "epoch": 1.5714203557223172, + "grad_norm": 57.2635612487793, + "learning_rate": 2.3809660737961383e-05, + "loss": 2.79, + "step": 5055000 + }, + { + "epoch": 1.571575788002804, + "grad_norm": 11.389161109924316, + "learning_rate": 2.380707019995327e-05, + "loss": 2.7936, + "step": 5055500 + }, + { + "epoch": 1.571731220283291, + "grad_norm": 16.933177947998047, + "learning_rate": 2.3804479661945154e-05, + "loss": 2.7975, + "step": 5056000 + }, + { + "epoch": 1.5718866525637778, + "grad_norm": 6.702452182769775, + "learning_rate": 2.3801889123937037e-05, + "loss": 2.7713, + "step": 5056500 + }, + { + "epoch": 1.5720420848442647, + "grad_norm": 8.99061107635498, + "learning_rate": 2.3799298585928925e-05, + "loss": 2.7663, + "step": 5057000 + }, + { + "epoch": 1.5721975171247515, + "grad_norm": 9.841646194458008, + "learning_rate": 2.3796708047920808e-05, + "loss": 2.772, + "step": 5057500 + }, + { + "epoch": 1.5723529494052384, + "grad_norm": 10.699134826660156, + "learning_rate": 2.3794117509912695e-05, + "loss": 2.7475, + "step": 5058000 + }, + { + "epoch": 1.5725083816857253, + "grad_norm": 10.208667755126953, + "learning_rate": 2.379152697190458e-05, + "loss": 2.7626, + "step": 5058500 + }, + { + "epoch": 1.5726638139662121, + "grad_norm": 8.186358451843262, + "learning_rate": 2.3788936433896466e-05, + "loss": 2.7615, + "step": 5059000 + }, + { + "epoch": 1.572819246246699, + "grad_norm": 7.460376739501953, + "learning_rate": 2.378634589588835e-05, + "loss": 2.7649, + "step": 5059500 + }, + { + "epoch": 1.5729746785271859, + "grad_norm": 9.594375610351562, + "learning_rate": 2.3783755357880237e-05, + "loss": 2.7345, + "step": 5060000 + }, + { + "epoch": 1.5731301108076727, + "grad_norm": 8.852680206298828, + "learning_rate": 2.3781164819872124e-05, + "loss": 2.7535, + "step": 5060500 + }, + { + "epoch": 1.5732855430881596, + "grad_norm": 9.963186264038086, + "learning_rate": 2.3778574281864008e-05, + "loss": 2.7628, + "step": 5061000 + }, + { + "epoch": 1.5734409753686465, + "grad_norm": 10.023161888122559, + "learning_rate": 2.3775983743855892e-05, + "loss": 2.7851, + "step": 5061500 + }, + { + "epoch": 1.5735964076491333, + "grad_norm": 8.071046829223633, + "learning_rate": 2.3773393205847776e-05, + "loss": 2.7944, + "step": 5062000 + }, + { + "epoch": 1.5737518399296202, + "grad_norm": 43.099849700927734, + "learning_rate": 2.3770802667839663e-05, + "loss": 2.7545, + "step": 5062500 + }, + { + "epoch": 1.573907272210107, + "grad_norm": 9.352400779724121, + "learning_rate": 2.376821212983155e-05, + "loss": 2.753, + "step": 5063000 + }, + { + "epoch": 1.574062704490594, + "grad_norm": 27.37605857849121, + "learning_rate": 2.3765621591823434e-05, + "loss": 2.6997, + "step": 5063500 + }, + { + "epoch": 1.5742181367710808, + "grad_norm": 9.815382957458496, + "learning_rate": 2.3763031053815318e-05, + "loss": 2.7446, + "step": 5064000 + }, + { + "epoch": 1.5743735690515677, + "grad_norm": 14.912002563476562, + "learning_rate": 2.3760440515807205e-05, + "loss": 2.7927, + "step": 5064500 + }, + { + "epoch": 1.5745290013320545, + "grad_norm": 7.419000625610352, + "learning_rate": 2.3757849977799092e-05, + "loss": 2.7755, + "step": 5065000 + }, + { + "epoch": 1.5746844336125414, + "grad_norm": 8.372291564941406, + "learning_rate": 2.3755259439790976e-05, + "loss": 2.7561, + "step": 5065500 + }, + { + "epoch": 1.5748398658930283, + "grad_norm": 10.86640739440918, + "learning_rate": 2.3752668901782863e-05, + "loss": 2.7737, + "step": 5066000 + }, + { + "epoch": 1.5749952981735151, + "grad_norm": 17.62009048461914, + "learning_rate": 2.3750078363774747e-05, + "loss": 2.7928, + "step": 5066500 + }, + { + "epoch": 1.575150730454002, + "grad_norm": 7.940601348876953, + "learning_rate": 2.374748782576663e-05, + "loss": 2.8143, + "step": 5067000 + }, + { + "epoch": 1.5753061627344889, + "grad_norm": 11.259702682495117, + "learning_rate": 2.3744897287758517e-05, + "loss": 2.7949, + "step": 5067500 + }, + { + "epoch": 1.5754615950149757, + "grad_norm": 26.416126251220703, + "learning_rate": 2.3742306749750405e-05, + "loss": 2.7817, + "step": 5068000 + }, + { + "epoch": 1.5756170272954628, + "grad_norm": 10.094643592834473, + "learning_rate": 2.373971621174229e-05, + "loss": 2.7736, + "step": 5068500 + }, + { + "epoch": 1.5757724595759497, + "grad_norm": 10.724847793579102, + "learning_rate": 2.3737125673734172e-05, + "loss": 2.7967, + "step": 5069000 + }, + { + "epoch": 1.5759278918564366, + "grad_norm": 9.194594383239746, + "learning_rate": 2.373453513572606e-05, + "loss": 2.7483, + "step": 5069500 + }, + { + "epoch": 1.5760833241369234, + "grad_norm": 8.952609062194824, + "learning_rate": 2.3731944597717946e-05, + "loss": 2.7901, + "step": 5070000 + }, + { + "epoch": 1.5762387564174103, + "grad_norm": 9.291343688964844, + "learning_rate": 2.372935405970983e-05, + "loss": 2.7537, + "step": 5070500 + }, + { + "epoch": 1.5763941886978972, + "grad_norm": 16.933223724365234, + "learning_rate": 2.3726763521701714e-05, + "loss": 2.7962, + "step": 5071000 + }, + { + "epoch": 1.576549620978384, + "grad_norm": 12.645269393920898, + "learning_rate": 2.37241729836936e-05, + "loss": 2.7512, + "step": 5071500 + }, + { + "epoch": 1.576705053258871, + "grad_norm": 7.976224422454834, + "learning_rate": 2.3721582445685485e-05, + "loss": 2.8221, + "step": 5072000 + }, + { + "epoch": 1.5768604855393578, + "grad_norm": 9.418681144714355, + "learning_rate": 2.3718991907677372e-05, + "loss": 2.8002, + "step": 5072500 + }, + { + "epoch": 1.5770159178198446, + "grad_norm": 8.254190444946289, + "learning_rate": 2.3716401369669256e-05, + "loss": 2.7349, + "step": 5073000 + }, + { + "epoch": 1.5771713501003315, + "grad_norm": 8.937939643859863, + "learning_rate": 2.3713810831661143e-05, + "loss": 2.7615, + "step": 5073500 + }, + { + "epoch": 1.5773267823808184, + "grad_norm": 8.446179389953613, + "learning_rate": 2.3711220293653027e-05, + "loss": 2.7632, + "step": 5074000 + }, + { + "epoch": 1.5774822146613054, + "grad_norm": 9.059488296508789, + "learning_rate": 2.370862975564491e-05, + "loss": 2.7421, + "step": 5074500 + }, + { + "epoch": 1.5776376469417923, + "grad_norm": 8.872584342956543, + "learning_rate": 2.37060392176368e-05, + "loss": 2.753, + "step": 5075000 + }, + { + "epoch": 1.5777930792222792, + "grad_norm": 8.823330879211426, + "learning_rate": 2.3703448679628685e-05, + "loss": 2.785, + "step": 5075500 + }, + { + "epoch": 1.577948511502766, + "grad_norm": 13.172499656677246, + "learning_rate": 2.370085814162057e-05, + "loss": 2.8317, + "step": 5076000 + }, + { + "epoch": 1.578103943783253, + "grad_norm": 8.245124816894531, + "learning_rate": 2.3698267603612452e-05, + "loss": 2.7424, + "step": 5076500 + }, + { + "epoch": 1.5782593760637398, + "grad_norm": 8.802189826965332, + "learning_rate": 2.369567706560434e-05, + "loss": 2.7374, + "step": 5077000 + }, + { + "epoch": 1.5784148083442266, + "grad_norm": 14.46467113494873, + "learning_rate": 2.3693086527596227e-05, + "loss": 2.7885, + "step": 5077500 + }, + { + "epoch": 1.5785702406247135, + "grad_norm": 7.778702259063721, + "learning_rate": 2.369049598958811e-05, + "loss": 2.7574, + "step": 5078000 + }, + { + "epoch": 1.5787256729052004, + "grad_norm": 10.290538787841797, + "learning_rate": 2.3687905451579998e-05, + "loss": 2.7279, + "step": 5078500 + }, + { + "epoch": 1.5788811051856873, + "grad_norm": 8.30142879486084, + "learning_rate": 2.368531491357188e-05, + "loss": 2.7513, + "step": 5079000 + }, + { + "epoch": 1.5790365374661741, + "grad_norm": 9.611917495727539, + "learning_rate": 2.3682724375563765e-05, + "loss": 2.735, + "step": 5079500 + }, + { + "epoch": 1.579191969746661, + "grad_norm": 8.601210594177246, + "learning_rate": 2.3680133837555652e-05, + "loss": 2.7757, + "step": 5080000 + }, + { + "epoch": 1.5793474020271479, + "grad_norm": 15.776715278625488, + "learning_rate": 2.367754329954754e-05, + "loss": 2.7952, + "step": 5080500 + }, + { + "epoch": 1.5795028343076347, + "grad_norm": 9.577768325805664, + "learning_rate": 2.3674952761539423e-05, + "loss": 2.7554, + "step": 5081000 + }, + { + "epoch": 1.5796582665881216, + "grad_norm": 9.028349876403809, + "learning_rate": 2.3672362223531307e-05, + "loss": 2.8105, + "step": 5081500 + }, + { + "epoch": 1.5798136988686085, + "grad_norm": 7.654994487762451, + "learning_rate": 2.3669771685523194e-05, + "loss": 2.7829, + "step": 5082000 + }, + { + "epoch": 1.5799691311490953, + "grad_norm": 10.366418838500977, + "learning_rate": 2.366718114751508e-05, + "loss": 2.782, + "step": 5082500 + }, + { + "epoch": 1.5801245634295822, + "grad_norm": 9.305604934692383, + "learning_rate": 2.3664590609506965e-05, + "loss": 2.7952, + "step": 5083000 + }, + { + "epoch": 1.580279995710069, + "grad_norm": 12.547201156616211, + "learning_rate": 2.366200007149885e-05, + "loss": 2.7505, + "step": 5083500 + }, + { + "epoch": 1.580435427990556, + "grad_norm": 8.975196838378906, + "learning_rate": 2.3659409533490736e-05, + "loss": 2.7599, + "step": 5084000 + }, + { + "epoch": 1.5805908602710428, + "grad_norm": 7.202384948730469, + "learning_rate": 2.365681899548262e-05, + "loss": 2.7634, + "step": 5084500 + }, + { + "epoch": 1.5807462925515297, + "grad_norm": 10.035125732421875, + "learning_rate": 2.3654228457474507e-05, + "loss": 2.7347, + "step": 5085000 + }, + { + "epoch": 1.5809017248320165, + "grad_norm": 7.432109832763672, + "learning_rate": 2.365163791946639e-05, + "loss": 2.7835, + "step": 5085500 + }, + { + "epoch": 1.5810571571125034, + "grad_norm": 8.496343612670898, + "learning_rate": 2.3649047381458278e-05, + "loss": 2.784, + "step": 5086000 + }, + { + "epoch": 1.5812125893929903, + "grad_norm": 10.24085521697998, + "learning_rate": 2.364645684345016e-05, + "loss": 2.7499, + "step": 5086500 + }, + { + "epoch": 1.5813680216734771, + "grad_norm": 9.201369285583496, + "learning_rate": 2.364386630544205e-05, + "loss": 2.7872, + "step": 5087000 + }, + { + "epoch": 1.581523453953964, + "grad_norm": 5.342231750488281, + "learning_rate": 2.3641275767433936e-05, + "loss": 2.7571, + "step": 5087500 + }, + { + "epoch": 1.5816788862344509, + "grad_norm": 10.893999099731445, + "learning_rate": 2.363868522942582e-05, + "loss": 2.766, + "step": 5088000 + }, + { + "epoch": 1.5818343185149377, + "grad_norm": 8.792964935302734, + "learning_rate": 2.3636094691417703e-05, + "loss": 2.7662, + "step": 5088500 + }, + { + "epoch": 1.5819897507954246, + "grad_norm": 8.242724418640137, + "learning_rate": 2.3633504153409587e-05, + "loss": 2.747, + "step": 5089000 + }, + { + "epoch": 1.5821451830759115, + "grad_norm": 9.315621376037598, + "learning_rate": 2.3630913615401478e-05, + "loss": 2.7766, + "step": 5089500 + }, + { + "epoch": 1.5823006153563983, + "grad_norm": 12.745707511901855, + "learning_rate": 2.362832307739336e-05, + "loss": 2.7656, + "step": 5090000 + }, + { + "epoch": 1.5824560476368852, + "grad_norm": 13.094696998596191, + "learning_rate": 2.3625732539385245e-05, + "loss": 2.7758, + "step": 5090500 + }, + { + "epoch": 1.582611479917372, + "grad_norm": 8.91799259185791, + "learning_rate": 2.362314200137713e-05, + "loss": 2.7631, + "step": 5091000 + }, + { + "epoch": 1.582766912197859, + "grad_norm": 9.198251724243164, + "learning_rate": 2.3620551463369016e-05, + "loss": 2.726, + "step": 5091500 + }, + { + "epoch": 1.5829223444783458, + "grad_norm": 8.993308067321777, + "learning_rate": 2.3617960925360903e-05, + "loss": 2.7656, + "step": 5092000 + }, + { + "epoch": 1.5830777767588329, + "grad_norm": 10.94028091430664, + "learning_rate": 2.3615370387352787e-05, + "loss": 2.7486, + "step": 5092500 + }, + { + "epoch": 1.5832332090393197, + "grad_norm": 5.402356147766113, + "learning_rate": 2.3612779849344674e-05, + "loss": 2.7572, + "step": 5093000 + }, + { + "epoch": 1.5833886413198066, + "grad_norm": 18.085840225219727, + "learning_rate": 2.3610189311336558e-05, + "loss": 2.7491, + "step": 5093500 + }, + { + "epoch": 1.5835440736002935, + "grad_norm": 8.131567001342773, + "learning_rate": 2.3607598773328442e-05, + "loss": 2.7324, + "step": 5094000 + }, + { + "epoch": 1.5836995058807803, + "grad_norm": 11.249000549316406, + "learning_rate": 2.360500823532033e-05, + "loss": 2.7561, + "step": 5094500 + }, + { + "epoch": 1.5838549381612672, + "grad_norm": 15.337522506713867, + "learning_rate": 2.3602417697312216e-05, + "loss": 2.7599, + "step": 5095000 + }, + { + "epoch": 1.584010370441754, + "grad_norm": 9.53431224822998, + "learning_rate": 2.35998271593041e-05, + "loss": 2.7873, + "step": 5095500 + }, + { + "epoch": 1.584165802722241, + "grad_norm": 13.281915664672852, + "learning_rate": 2.3597236621295984e-05, + "loss": 2.779, + "step": 5096000 + }, + { + "epoch": 1.5843212350027278, + "grad_norm": 10.553998947143555, + "learning_rate": 2.359464608328787e-05, + "loss": 2.707, + "step": 5096500 + }, + { + "epoch": 1.5844766672832147, + "grad_norm": 16.1261043548584, + "learning_rate": 2.3592055545279758e-05, + "loss": 2.7946, + "step": 5097000 + }, + { + "epoch": 1.5846320995637015, + "grad_norm": 6.582859039306641, + "learning_rate": 2.358946500727164e-05, + "loss": 2.7484, + "step": 5097500 + }, + { + "epoch": 1.5847875318441884, + "grad_norm": 9.084176063537598, + "learning_rate": 2.3586874469263525e-05, + "loss": 2.7922, + "step": 5098000 + }, + { + "epoch": 1.5849429641246755, + "grad_norm": 9.450811386108398, + "learning_rate": 2.3584283931255413e-05, + "loss": 2.7255, + "step": 5098500 + }, + { + "epoch": 1.5850983964051624, + "grad_norm": 7.686408519744873, + "learning_rate": 2.3581693393247296e-05, + "loss": 2.7806, + "step": 5099000 + }, + { + "epoch": 1.5852538286856492, + "grad_norm": 8.672663688659668, + "learning_rate": 2.3579102855239183e-05, + "loss": 2.7717, + "step": 5099500 + }, + { + "epoch": 1.585409260966136, + "grad_norm": 9.185370445251465, + "learning_rate": 2.3576512317231067e-05, + "loss": 2.7981, + "step": 5100000 + }, + { + "epoch": 1.585564693246623, + "grad_norm": 8.623221397399902, + "learning_rate": 2.3573921779222954e-05, + "loss": 2.7962, + "step": 5100500 + }, + { + "epoch": 1.5857201255271098, + "grad_norm": 10.661903381347656, + "learning_rate": 2.3571331241214838e-05, + "loss": 2.7705, + "step": 5101000 + }, + { + "epoch": 1.5858755578075967, + "grad_norm": 10.341546058654785, + "learning_rate": 2.3568740703206725e-05, + "loss": 2.7619, + "step": 5101500 + }, + { + "epoch": 1.5860309900880836, + "grad_norm": 9.655685424804688, + "learning_rate": 2.3566150165198612e-05, + "loss": 2.8001, + "step": 5102000 + }, + { + "epoch": 1.5861864223685704, + "grad_norm": 12.160626411437988, + "learning_rate": 2.3563559627190496e-05, + "loss": 2.7818, + "step": 5102500 + }, + { + "epoch": 1.5863418546490573, + "grad_norm": 5.9725823402404785, + "learning_rate": 2.356096908918238e-05, + "loss": 2.7926, + "step": 5103000 + }, + { + "epoch": 1.5864972869295442, + "grad_norm": 10.216863632202148, + "learning_rate": 2.3558378551174264e-05, + "loss": 2.7127, + "step": 5103500 + }, + { + "epoch": 1.586652719210031, + "grad_norm": 6.958423614501953, + "learning_rate": 2.355578801316615e-05, + "loss": 2.7479, + "step": 5104000 + }, + { + "epoch": 1.586808151490518, + "grad_norm": 8.740806579589844, + "learning_rate": 2.3553197475158038e-05, + "loss": 2.6905, + "step": 5104500 + }, + { + "epoch": 1.5869635837710048, + "grad_norm": 9.847407341003418, + "learning_rate": 2.3550606937149922e-05, + "loss": 2.7791, + "step": 5105000 + }, + { + "epoch": 1.5871190160514916, + "grad_norm": 9.04459285736084, + "learning_rate": 2.354801639914181e-05, + "loss": 2.7194, + "step": 5105500 + }, + { + "epoch": 1.5872744483319785, + "grad_norm": 8.633233070373535, + "learning_rate": 2.3545425861133693e-05, + "loss": 2.7828, + "step": 5106000 + }, + { + "epoch": 1.5874298806124654, + "grad_norm": 11.530259132385254, + "learning_rate": 2.354283532312558e-05, + "loss": 2.8015, + "step": 5106500 + }, + { + "epoch": 1.5875853128929522, + "grad_norm": 9.056884765625, + "learning_rate": 2.3540244785117464e-05, + "loss": 2.718, + "step": 5107000 + }, + { + "epoch": 1.587740745173439, + "grad_norm": 9.96107006072998, + "learning_rate": 2.353765424710935e-05, + "loss": 2.7546, + "step": 5107500 + }, + { + "epoch": 1.587896177453926, + "grad_norm": 10.107781410217285, + "learning_rate": 2.3535063709101235e-05, + "loss": 2.7795, + "step": 5108000 + }, + { + "epoch": 1.5880516097344128, + "grad_norm": 7.315760135650635, + "learning_rate": 2.353247317109312e-05, + "loss": 2.7899, + "step": 5108500 + }, + { + "epoch": 1.5882070420148997, + "grad_norm": 11.28482723236084, + "learning_rate": 2.3529882633085005e-05, + "loss": 2.7736, + "step": 5109000 + }, + { + "epoch": 1.5883624742953866, + "grad_norm": 7.930294036865234, + "learning_rate": 2.3527292095076893e-05, + "loss": 2.7406, + "step": 5109500 + }, + { + "epoch": 1.5885179065758734, + "grad_norm": 16.86952781677246, + "learning_rate": 2.3524701557068776e-05, + "loss": 2.7776, + "step": 5110000 + }, + { + "epoch": 1.5886733388563603, + "grad_norm": 14.265830993652344, + "learning_rate": 2.352211101906066e-05, + "loss": 2.7553, + "step": 5110500 + }, + { + "epoch": 1.5888287711368472, + "grad_norm": 10.723340034484863, + "learning_rate": 2.3519520481052547e-05, + "loss": 2.7375, + "step": 5111000 + }, + { + "epoch": 1.588984203417334, + "grad_norm": 9.377970695495605, + "learning_rate": 2.3516929943044434e-05, + "loss": 2.7613, + "step": 5111500 + }, + { + "epoch": 1.589139635697821, + "grad_norm": 10.667738914489746, + "learning_rate": 2.3514339405036318e-05, + "loss": 2.7406, + "step": 5112000 + }, + { + "epoch": 1.5892950679783078, + "grad_norm": 9.714787483215332, + "learning_rate": 2.3511748867028202e-05, + "loss": 2.7596, + "step": 5112500 + }, + { + "epoch": 1.5894505002587946, + "grad_norm": 10.52643871307373, + "learning_rate": 2.350915832902009e-05, + "loss": 2.7819, + "step": 5113000 + }, + { + "epoch": 1.5896059325392815, + "grad_norm": 9.601496696472168, + "learning_rate": 2.3506567791011973e-05, + "loss": 2.696, + "step": 5113500 + }, + { + "epoch": 1.5897613648197684, + "grad_norm": 9.367411613464355, + "learning_rate": 2.350397725300386e-05, + "loss": 2.7552, + "step": 5114000 + }, + { + "epoch": 1.5899167971002552, + "grad_norm": 10.650286674499512, + "learning_rate": 2.3501386714995747e-05, + "loss": 2.7473, + "step": 5114500 + }, + { + "epoch": 1.590072229380742, + "grad_norm": 7.981941223144531, + "learning_rate": 2.349879617698763e-05, + "loss": 2.7945, + "step": 5115000 + }, + { + "epoch": 1.590227661661229, + "grad_norm": 5.601129055023193, + "learning_rate": 2.3496205638979515e-05, + "loss": 2.7561, + "step": 5115500 + }, + { + "epoch": 1.5903830939417158, + "grad_norm": 9.49743366241455, + "learning_rate": 2.34936151009714e-05, + "loss": 2.7455, + "step": 5116000 + }, + { + "epoch": 1.590538526222203, + "grad_norm": 7.854641914367676, + "learning_rate": 2.349102456296329e-05, + "loss": 2.753, + "step": 5116500 + }, + { + "epoch": 1.5906939585026898, + "grad_norm": 12.785677909851074, + "learning_rate": 2.3488434024955173e-05, + "loss": 2.7575, + "step": 5117000 + }, + { + "epoch": 1.5908493907831767, + "grad_norm": 9.618005752563477, + "learning_rate": 2.3485843486947057e-05, + "loss": 2.7666, + "step": 5117500 + }, + { + "epoch": 1.5910048230636635, + "grad_norm": 11.046232223510742, + "learning_rate": 2.348325294893894e-05, + "loss": 2.7818, + "step": 5118000 + }, + { + "epoch": 1.5911602553441504, + "grad_norm": 14.06129264831543, + "learning_rate": 2.3480662410930828e-05, + "loss": 2.803, + "step": 5118500 + }, + { + "epoch": 1.5913156876246373, + "grad_norm": 11.177227020263672, + "learning_rate": 2.3478071872922715e-05, + "loss": 2.7671, + "step": 5119000 + }, + { + "epoch": 1.5914711199051241, + "grad_norm": 43.01309585571289, + "learning_rate": 2.34754813349146e-05, + "loss": 2.7428, + "step": 5119500 + }, + { + "epoch": 1.591626552185611, + "grad_norm": 9.857718467712402, + "learning_rate": 2.3472890796906486e-05, + "loss": 2.7258, + "step": 5120000 + }, + { + "epoch": 1.5917819844660979, + "grad_norm": 12.214540481567383, + "learning_rate": 2.347030025889837e-05, + "loss": 2.7845, + "step": 5120500 + }, + { + "epoch": 1.5919374167465847, + "grad_norm": 8.704618453979492, + "learning_rate": 2.3467709720890253e-05, + "loss": 2.7193, + "step": 5121000 + }, + { + "epoch": 1.5920928490270716, + "grad_norm": 8.188905715942383, + "learning_rate": 2.346511918288214e-05, + "loss": 2.7351, + "step": 5121500 + }, + { + "epoch": 1.5922482813075585, + "grad_norm": 7.078772068023682, + "learning_rate": 2.3462528644874027e-05, + "loss": 2.7642, + "step": 5122000 + }, + { + "epoch": 1.5924037135880456, + "grad_norm": 13.954833984375, + "learning_rate": 2.345993810686591e-05, + "loss": 2.7197, + "step": 5122500 + }, + { + "epoch": 1.5925591458685324, + "grad_norm": 11.067407608032227, + "learning_rate": 2.3457347568857795e-05, + "loss": 2.7476, + "step": 5123000 + }, + { + "epoch": 1.5927145781490193, + "grad_norm": 8.539081573486328, + "learning_rate": 2.3454757030849682e-05, + "loss": 2.7423, + "step": 5123500 + }, + { + "epoch": 1.5928700104295062, + "grad_norm": 9.197466850280762, + "learning_rate": 2.345216649284157e-05, + "loss": 2.7863, + "step": 5124000 + }, + { + "epoch": 1.593025442709993, + "grad_norm": 9.027262687683105, + "learning_rate": 2.3449575954833453e-05, + "loss": 2.7432, + "step": 5124500 + }, + { + "epoch": 1.59318087499048, + "grad_norm": 10.771114349365234, + "learning_rate": 2.3446985416825337e-05, + "loss": 2.7604, + "step": 5125000 + }, + { + "epoch": 1.5933363072709668, + "grad_norm": 9.259749412536621, + "learning_rate": 2.3444394878817224e-05, + "loss": 2.7511, + "step": 5125500 + }, + { + "epoch": 1.5934917395514536, + "grad_norm": 11.984267234802246, + "learning_rate": 2.3441804340809108e-05, + "loss": 2.7034, + "step": 5126000 + }, + { + "epoch": 1.5936471718319405, + "grad_norm": 11.096942901611328, + "learning_rate": 2.3439213802800995e-05, + "loss": 2.7371, + "step": 5126500 + }, + { + "epoch": 1.5938026041124274, + "grad_norm": 8.677177429199219, + "learning_rate": 2.343662326479288e-05, + "loss": 2.766, + "step": 5127000 + }, + { + "epoch": 1.5939580363929142, + "grad_norm": 8.73959732055664, + "learning_rate": 2.3434032726784766e-05, + "loss": 2.7617, + "step": 5127500 + }, + { + "epoch": 1.594113468673401, + "grad_norm": 8.649377822875977, + "learning_rate": 2.343144218877665e-05, + "loss": 2.743, + "step": 5128000 + }, + { + "epoch": 1.594268900953888, + "grad_norm": 19.18834686279297, + "learning_rate": 2.3428851650768537e-05, + "loss": 2.6975, + "step": 5128500 + }, + { + "epoch": 1.5944243332343748, + "grad_norm": 9.701581001281738, + "learning_rate": 2.3426261112760424e-05, + "loss": 2.7799, + "step": 5129000 + }, + { + "epoch": 1.5945797655148617, + "grad_norm": 22.096420288085938, + "learning_rate": 2.3423670574752308e-05, + "loss": 2.7412, + "step": 5129500 + }, + { + "epoch": 1.5947351977953486, + "grad_norm": 8.358189582824707, + "learning_rate": 2.342108003674419e-05, + "loss": 2.8096, + "step": 5130000 + }, + { + "epoch": 1.5948906300758354, + "grad_norm": 9.088740348815918, + "learning_rate": 2.3418489498736075e-05, + "loss": 2.7714, + "step": 5130500 + }, + { + "epoch": 1.5950460623563223, + "grad_norm": 8.894735336303711, + "learning_rate": 2.3415898960727962e-05, + "loss": 2.6956, + "step": 5131000 + }, + { + "epoch": 1.5952014946368092, + "grad_norm": 9.228143692016602, + "learning_rate": 2.341330842271985e-05, + "loss": 2.7366, + "step": 5131500 + }, + { + "epoch": 1.595356926917296, + "grad_norm": 9.461597442626953, + "learning_rate": 2.3410717884711733e-05, + "loss": 2.7465, + "step": 5132000 + }, + { + "epoch": 1.595512359197783, + "grad_norm": 9.355183601379395, + "learning_rate": 2.340812734670362e-05, + "loss": 2.7402, + "step": 5132500 + }, + { + "epoch": 1.5956677914782698, + "grad_norm": 10.21861743927002, + "learning_rate": 2.3405536808695504e-05, + "loss": 2.7529, + "step": 5133000 + }, + { + "epoch": 1.5958232237587566, + "grad_norm": 8.148193359375, + "learning_rate": 2.340294627068739e-05, + "loss": 2.7602, + "step": 5133500 + }, + { + "epoch": 1.5959786560392435, + "grad_norm": 9.740922927856445, + "learning_rate": 2.3400355732679275e-05, + "loss": 2.6988, + "step": 5134000 + }, + { + "epoch": 1.5961340883197304, + "grad_norm": 8.007722854614258, + "learning_rate": 2.3397765194671162e-05, + "loss": 2.7947, + "step": 5134500 + }, + { + "epoch": 1.5962895206002172, + "grad_norm": 8.701258659362793, + "learning_rate": 2.3395174656663046e-05, + "loss": 2.759, + "step": 5135000 + }, + { + "epoch": 1.596444952880704, + "grad_norm": 42.883323669433594, + "learning_rate": 2.339258411865493e-05, + "loss": 2.7813, + "step": 5135500 + }, + { + "epoch": 1.596600385161191, + "grad_norm": 18.577592849731445, + "learning_rate": 2.3389993580646817e-05, + "loss": 2.7825, + "step": 5136000 + }, + { + "epoch": 1.5967558174416778, + "grad_norm": 12.176203727722168, + "learning_rate": 2.3387403042638704e-05, + "loss": 2.7559, + "step": 5136500 + }, + { + "epoch": 1.5969112497221647, + "grad_norm": 12.071855545043945, + "learning_rate": 2.3384812504630588e-05, + "loss": 2.7236, + "step": 5137000 + }, + { + "epoch": 1.5970666820026516, + "grad_norm": 8.104104995727539, + "learning_rate": 2.338222196662247e-05, + "loss": 2.7811, + "step": 5137500 + }, + { + "epoch": 1.5972221142831384, + "grad_norm": 19.671079635620117, + "learning_rate": 2.337963142861436e-05, + "loss": 2.817, + "step": 5138000 + }, + { + "epoch": 1.5973775465636253, + "grad_norm": 8.858311653137207, + "learning_rate": 2.3377040890606246e-05, + "loss": 2.8032, + "step": 5138500 + }, + { + "epoch": 1.5975329788441122, + "grad_norm": 16.511592864990234, + "learning_rate": 2.337445035259813e-05, + "loss": 2.7956, + "step": 5139000 + }, + { + "epoch": 1.597688411124599, + "grad_norm": 8.693007469177246, + "learning_rate": 2.3371859814590013e-05, + "loss": 2.7026, + "step": 5139500 + }, + { + "epoch": 1.597843843405086, + "grad_norm": 20.330659866333008, + "learning_rate": 2.33692692765819e-05, + "loss": 2.7188, + "step": 5140000 + }, + { + "epoch": 1.597999275685573, + "grad_norm": 13.827223777770996, + "learning_rate": 2.3366678738573784e-05, + "loss": 2.7511, + "step": 5140500 + }, + { + "epoch": 1.5981547079660599, + "grad_norm": 8.430691719055176, + "learning_rate": 2.336408820056567e-05, + "loss": 2.7452, + "step": 5141000 + }, + { + "epoch": 1.5983101402465467, + "grad_norm": 12.840394020080566, + "learning_rate": 2.336149766255756e-05, + "loss": 2.8207, + "step": 5141500 + }, + { + "epoch": 1.5984655725270336, + "grad_norm": 9.532419204711914, + "learning_rate": 2.3358907124549442e-05, + "loss": 2.8177, + "step": 5142000 + }, + { + "epoch": 1.5986210048075205, + "grad_norm": 11.722804069519043, + "learning_rate": 2.3356316586541326e-05, + "loss": 2.7565, + "step": 5142500 + }, + { + "epoch": 1.5987764370880073, + "grad_norm": 9.917089462280273, + "learning_rate": 2.335372604853321e-05, + "loss": 2.7316, + "step": 5143000 + }, + { + "epoch": 1.5989318693684942, + "grad_norm": 9.268190383911133, + "learning_rate": 2.33511355105251e-05, + "loss": 2.7693, + "step": 5143500 + }, + { + "epoch": 1.599087301648981, + "grad_norm": 8.609796524047852, + "learning_rate": 2.3348544972516984e-05, + "loss": 2.7959, + "step": 5144000 + }, + { + "epoch": 1.599242733929468, + "grad_norm": 76.2782974243164, + "learning_rate": 2.3345954434508868e-05, + "loss": 2.7787, + "step": 5144500 + }, + { + "epoch": 1.5993981662099548, + "grad_norm": 8.443098068237305, + "learning_rate": 2.3343363896500752e-05, + "loss": 2.7834, + "step": 5145000 + }, + { + "epoch": 1.5995535984904417, + "grad_norm": 9.611163139343262, + "learning_rate": 2.334077335849264e-05, + "loss": 2.7139, + "step": 5145500 + }, + { + "epoch": 1.5997090307709285, + "grad_norm": 12.001129150390625, + "learning_rate": 2.3338182820484526e-05, + "loss": 2.7644, + "step": 5146000 + }, + { + "epoch": 1.5998644630514156, + "grad_norm": 11.76477336883545, + "learning_rate": 2.333559228247641e-05, + "loss": 2.7093, + "step": 5146500 + }, + { + "epoch": 1.6000198953319025, + "grad_norm": 9.910595893859863, + "learning_rate": 2.3333001744468297e-05, + "loss": 2.7761, + "step": 5147000 + }, + { + "epoch": 1.6001753276123893, + "grad_norm": 9.05434513092041, + "learning_rate": 2.333041120646018e-05, + "loss": 2.7586, + "step": 5147500 + }, + { + "epoch": 1.6003307598928762, + "grad_norm": 10.208993911743164, + "learning_rate": 2.3327820668452065e-05, + "loss": 2.7771, + "step": 5148000 + }, + { + "epoch": 1.600486192173363, + "grad_norm": 9.503155708312988, + "learning_rate": 2.332523013044395e-05, + "loss": 2.79, + "step": 5148500 + }, + { + "epoch": 1.60064162445385, + "grad_norm": 10.145907402038574, + "learning_rate": 2.332263959243584e-05, + "loss": 2.7611, + "step": 5149000 + }, + { + "epoch": 1.6007970567343368, + "grad_norm": 8.614574432373047, + "learning_rate": 2.3320049054427723e-05, + "loss": 2.7331, + "step": 5149500 + }, + { + "epoch": 1.6009524890148237, + "grad_norm": 7.378167629241943, + "learning_rate": 2.3317458516419606e-05, + "loss": 2.7986, + "step": 5150000 + }, + { + "epoch": 1.6011079212953105, + "grad_norm": 16.147743225097656, + "learning_rate": 2.3314867978411494e-05, + "loss": 2.7954, + "step": 5150500 + }, + { + "epoch": 1.6012633535757974, + "grad_norm": 9.320904731750488, + "learning_rate": 2.331227744040338e-05, + "loss": 2.8031, + "step": 5151000 + }, + { + "epoch": 1.6014187858562843, + "grad_norm": 10.569913864135742, + "learning_rate": 2.3309686902395264e-05, + "loss": 2.7825, + "step": 5151500 + }, + { + "epoch": 1.6015742181367711, + "grad_norm": 10.21893310546875, + "learning_rate": 2.3307096364387148e-05, + "loss": 2.7531, + "step": 5152000 + }, + { + "epoch": 1.601729650417258, + "grad_norm": 8.46308708190918, + "learning_rate": 2.3304505826379035e-05, + "loss": 2.7281, + "step": 5152500 + }, + { + "epoch": 1.6018850826977449, + "grad_norm": 29.567584991455078, + "learning_rate": 2.330191528837092e-05, + "loss": 2.822, + "step": 5153000 + }, + { + "epoch": 1.6020405149782317, + "grad_norm": 10.263615608215332, + "learning_rate": 2.3299324750362806e-05, + "loss": 2.7607, + "step": 5153500 + }, + { + "epoch": 1.6021959472587186, + "grad_norm": 8.317065238952637, + "learning_rate": 2.329673421235469e-05, + "loss": 2.7637, + "step": 5154000 + }, + { + "epoch": 1.6023513795392055, + "grad_norm": 16.357284545898438, + "learning_rate": 2.3294143674346577e-05, + "loss": 2.758, + "step": 5154500 + }, + { + "epoch": 1.6025068118196923, + "grad_norm": 14.080825805664062, + "learning_rate": 2.329155313633846e-05, + "loss": 2.7922, + "step": 5155000 + }, + { + "epoch": 1.6026622441001792, + "grad_norm": 9.083062171936035, + "learning_rate": 2.3288962598330348e-05, + "loss": 2.8042, + "step": 5155500 + }, + { + "epoch": 1.602817676380666, + "grad_norm": 8.452576637268066, + "learning_rate": 2.3286372060322235e-05, + "loss": 2.7134, + "step": 5156000 + }, + { + "epoch": 1.602973108661153, + "grad_norm": 8.5770845413208, + "learning_rate": 2.328378152231412e-05, + "loss": 2.8019, + "step": 5156500 + }, + { + "epoch": 1.6031285409416398, + "grad_norm": 13.803182601928711, + "learning_rate": 2.3281190984306003e-05, + "loss": 2.782, + "step": 5157000 + }, + { + "epoch": 1.6032839732221267, + "grad_norm": 8.432373046875, + "learning_rate": 2.3278600446297887e-05, + "loss": 2.7684, + "step": 5157500 + }, + { + "epoch": 1.6034394055026135, + "grad_norm": 10.50624942779541, + "learning_rate": 2.3276009908289774e-05, + "loss": 2.8138, + "step": 5158000 + }, + { + "epoch": 1.6035948377831004, + "grad_norm": 11.583072662353516, + "learning_rate": 2.327341937028166e-05, + "loss": 2.7564, + "step": 5158500 + }, + { + "epoch": 1.6037502700635873, + "grad_norm": 11.585800170898438, + "learning_rate": 2.3270828832273545e-05, + "loss": 2.7552, + "step": 5159000 + }, + { + "epoch": 1.6039057023440741, + "grad_norm": 10.54979133605957, + "learning_rate": 2.3268238294265432e-05, + "loss": 2.8097, + "step": 5159500 + }, + { + "epoch": 1.604061134624561, + "grad_norm": 10.221659660339355, + "learning_rate": 2.3265647756257316e-05, + "loss": 2.7836, + "step": 5160000 + }, + { + "epoch": 1.6042165669050479, + "grad_norm": 8.32010269165039, + "learning_rate": 2.3263057218249203e-05, + "loss": 2.7811, + "step": 5160500 + }, + { + "epoch": 1.6043719991855347, + "grad_norm": 8.88825511932373, + "learning_rate": 2.3260466680241086e-05, + "loss": 2.7424, + "step": 5161000 + }, + { + "epoch": 1.6045274314660216, + "grad_norm": 9.089875221252441, + "learning_rate": 2.3257876142232974e-05, + "loss": 2.7862, + "step": 5161500 + }, + { + "epoch": 1.6046828637465085, + "grad_norm": 8.243586540222168, + "learning_rate": 2.3255285604224857e-05, + "loss": 2.7808, + "step": 5162000 + }, + { + "epoch": 1.6048382960269953, + "grad_norm": 16.014175415039062, + "learning_rate": 2.325269506621674e-05, + "loss": 2.8055, + "step": 5162500 + }, + { + "epoch": 1.6049937283074822, + "grad_norm": 10.49732494354248, + "learning_rate": 2.325010452820863e-05, + "loss": 2.7615, + "step": 5163000 + }, + { + "epoch": 1.605149160587969, + "grad_norm": 35.15034103393555, + "learning_rate": 2.3247513990200515e-05, + "loss": 2.7997, + "step": 5163500 + }, + { + "epoch": 1.605304592868456, + "grad_norm": 9.23162841796875, + "learning_rate": 2.32449234521924e-05, + "loss": 2.7613, + "step": 5164000 + }, + { + "epoch": 1.605460025148943, + "grad_norm": 13.98612117767334, + "learning_rate": 2.3242332914184283e-05, + "loss": 2.7374, + "step": 5164500 + }, + { + "epoch": 1.60561545742943, + "grad_norm": 8.423091888427734, + "learning_rate": 2.323974237617617e-05, + "loss": 2.772, + "step": 5165000 + }, + { + "epoch": 1.6057708897099168, + "grad_norm": 8.941024780273438, + "learning_rate": 2.3237151838168057e-05, + "loss": 2.7675, + "step": 5165500 + }, + { + "epoch": 1.6059263219904036, + "grad_norm": 7.433272838592529, + "learning_rate": 2.323456130015994e-05, + "loss": 2.7615, + "step": 5166000 + }, + { + "epoch": 1.6060817542708905, + "grad_norm": 7.828085422515869, + "learning_rate": 2.3231970762151825e-05, + "loss": 2.7932, + "step": 5166500 + }, + { + "epoch": 1.6062371865513774, + "grad_norm": 11.484552383422852, + "learning_rate": 2.3229380224143712e-05, + "loss": 2.7342, + "step": 5167000 + }, + { + "epoch": 1.6063926188318642, + "grad_norm": 11.080145835876465, + "learning_rate": 2.3226789686135596e-05, + "loss": 2.7804, + "step": 5167500 + }, + { + "epoch": 1.606548051112351, + "grad_norm": 10.185639381408691, + "learning_rate": 2.3224199148127483e-05, + "loss": 2.7806, + "step": 5168000 + }, + { + "epoch": 1.606703483392838, + "grad_norm": 8.892071723937988, + "learning_rate": 2.322160861011937e-05, + "loss": 2.7381, + "step": 5168500 + }, + { + "epoch": 1.6068589156733248, + "grad_norm": 11.334973335266113, + "learning_rate": 2.3219018072111254e-05, + "loss": 2.7268, + "step": 5169000 + }, + { + "epoch": 1.6070143479538117, + "grad_norm": 11.90125560760498, + "learning_rate": 2.3216427534103138e-05, + "loss": 2.7747, + "step": 5169500 + }, + { + "epoch": 1.6071697802342986, + "grad_norm": 9.307464599609375, + "learning_rate": 2.321383699609502e-05, + "loss": 2.7209, + "step": 5170000 + }, + { + "epoch": 1.6073252125147857, + "grad_norm": 10.058268547058105, + "learning_rate": 2.3211246458086912e-05, + "loss": 2.8192, + "step": 5170500 + }, + { + "epoch": 1.6074806447952725, + "grad_norm": 10.1842041015625, + "learning_rate": 2.3208655920078796e-05, + "loss": 2.7362, + "step": 5171000 + }, + { + "epoch": 1.6076360770757594, + "grad_norm": 11.752290725708008, + "learning_rate": 2.320606538207068e-05, + "loss": 2.7819, + "step": 5171500 + }, + { + "epoch": 1.6077915093562463, + "grad_norm": 35.31758117675781, + "learning_rate": 2.3203474844062563e-05, + "loss": 2.7752, + "step": 5172000 + }, + { + "epoch": 1.6079469416367331, + "grad_norm": 5.920510768890381, + "learning_rate": 2.320088430605445e-05, + "loss": 2.7178, + "step": 5172500 + }, + { + "epoch": 1.60810237391722, + "grad_norm": 7.6661272048950195, + "learning_rate": 2.3198293768046338e-05, + "loss": 2.7524, + "step": 5173000 + }, + { + "epoch": 1.6082578061977069, + "grad_norm": 8.730644226074219, + "learning_rate": 2.319570323003822e-05, + "loss": 2.8139, + "step": 5173500 + }, + { + "epoch": 1.6084132384781937, + "grad_norm": 11.74277400970459, + "learning_rate": 2.319311269203011e-05, + "loss": 2.7361, + "step": 5174000 + }, + { + "epoch": 1.6085686707586806, + "grad_norm": 7.106884479522705, + "learning_rate": 2.3190522154021992e-05, + "loss": 2.7748, + "step": 5174500 + }, + { + "epoch": 1.6087241030391675, + "grad_norm": 8.357766151428223, + "learning_rate": 2.3187931616013876e-05, + "loss": 2.7357, + "step": 5175000 + }, + { + "epoch": 1.6088795353196543, + "grad_norm": 21.714296340942383, + "learning_rate": 2.3185341078005763e-05, + "loss": 2.733, + "step": 5175500 + }, + { + "epoch": 1.6090349676001412, + "grad_norm": 9.112799644470215, + "learning_rate": 2.318275053999765e-05, + "loss": 2.7112, + "step": 5176000 + }, + { + "epoch": 1.609190399880628, + "grad_norm": 7.542612075805664, + "learning_rate": 2.3180160001989534e-05, + "loss": 2.7092, + "step": 5176500 + }, + { + "epoch": 1.609345832161115, + "grad_norm": 8.688337326049805, + "learning_rate": 2.3177569463981418e-05, + "loss": 2.7617, + "step": 5177000 + }, + { + "epoch": 1.6095012644416018, + "grad_norm": 12.133874893188477, + "learning_rate": 2.3174978925973305e-05, + "loss": 2.7871, + "step": 5177500 + }, + { + "epoch": 1.6096566967220887, + "grad_norm": 7.730568885803223, + "learning_rate": 2.3172388387965192e-05, + "loss": 2.7778, + "step": 5178000 + }, + { + "epoch": 1.6098121290025755, + "grad_norm": 12.953596115112305, + "learning_rate": 2.3169797849957076e-05, + "loss": 2.7596, + "step": 5178500 + }, + { + "epoch": 1.6099675612830624, + "grad_norm": 9.184362411499023, + "learning_rate": 2.316720731194896e-05, + "loss": 2.7369, + "step": 5179000 + }, + { + "epoch": 1.6101229935635493, + "grad_norm": 11.554825782775879, + "learning_rate": 2.3164616773940847e-05, + "loss": 2.7967, + "step": 5179500 + }, + { + "epoch": 1.6102784258440361, + "grad_norm": 8.681572914123535, + "learning_rate": 2.3162026235932734e-05, + "loss": 2.7664, + "step": 5180000 + }, + { + "epoch": 1.610433858124523, + "grad_norm": 8.326164245605469, + "learning_rate": 2.3159435697924618e-05, + "loss": 2.753, + "step": 5180500 + }, + { + "epoch": 1.6105892904050099, + "grad_norm": 10.00518798828125, + "learning_rate": 2.3156845159916505e-05, + "loss": 2.7914, + "step": 5181000 + }, + { + "epoch": 1.6107447226854967, + "grad_norm": 9.648258209228516, + "learning_rate": 2.315425462190839e-05, + "loss": 2.7592, + "step": 5181500 + }, + { + "epoch": 1.6109001549659836, + "grad_norm": 10.987586975097656, + "learning_rate": 2.3151664083900272e-05, + "loss": 2.775, + "step": 5182000 + }, + { + "epoch": 1.6110555872464705, + "grad_norm": 11.515443801879883, + "learning_rate": 2.314907354589216e-05, + "loss": 2.7494, + "step": 5182500 + }, + { + "epoch": 1.6112110195269573, + "grad_norm": 10.094685554504395, + "learning_rate": 2.3146483007884047e-05, + "loss": 2.7617, + "step": 5183000 + }, + { + "epoch": 1.6113664518074442, + "grad_norm": 20.34502410888672, + "learning_rate": 2.314389246987593e-05, + "loss": 2.7488, + "step": 5183500 + }, + { + "epoch": 1.611521884087931, + "grad_norm": 9.656476974487305, + "learning_rate": 2.3141301931867814e-05, + "loss": 2.6946, + "step": 5184000 + }, + { + "epoch": 1.611677316368418, + "grad_norm": 6.460864067077637, + "learning_rate": 2.3138711393859698e-05, + "loss": 2.8015, + "step": 5184500 + }, + { + "epoch": 1.6118327486489048, + "grad_norm": 9.738685607910156, + "learning_rate": 2.313612085585159e-05, + "loss": 2.7564, + "step": 5185000 + }, + { + "epoch": 1.6119881809293917, + "grad_norm": 8.39207935333252, + "learning_rate": 2.3133530317843472e-05, + "loss": 2.7819, + "step": 5185500 + }, + { + "epoch": 1.6121436132098785, + "grad_norm": 9.510161399841309, + "learning_rate": 2.3130939779835356e-05, + "loss": 2.7535, + "step": 5186000 + }, + { + "epoch": 1.6122990454903654, + "grad_norm": 28.198640823364258, + "learning_rate": 2.3128349241827243e-05, + "loss": 2.7717, + "step": 5186500 + }, + { + "epoch": 1.6124544777708523, + "grad_norm": 9.819100379943848, + "learning_rate": 2.3125758703819127e-05, + "loss": 2.7511, + "step": 5187000 + }, + { + "epoch": 1.6126099100513391, + "grad_norm": 7.517017364501953, + "learning_rate": 2.3123168165811014e-05, + "loss": 2.7799, + "step": 5187500 + }, + { + "epoch": 1.612765342331826, + "grad_norm": 16.254148483276367, + "learning_rate": 2.3120577627802898e-05, + "loss": 2.7229, + "step": 5188000 + }, + { + "epoch": 1.6129207746123129, + "grad_norm": 12.742059707641602, + "learning_rate": 2.3117987089794785e-05, + "loss": 2.772, + "step": 5188500 + }, + { + "epoch": 1.6130762068928, + "grad_norm": 19.47271728515625, + "learning_rate": 2.311539655178667e-05, + "loss": 2.7357, + "step": 5189000 + }, + { + "epoch": 1.6132316391732868, + "grad_norm": 9.04940414428711, + "learning_rate": 2.3112806013778553e-05, + "loss": 2.7534, + "step": 5189500 + }, + { + "epoch": 1.6133870714537737, + "grad_norm": 10.414351463317871, + "learning_rate": 2.3110215475770443e-05, + "loss": 2.7559, + "step": 5190000 + }, + { + "epoch": 1.6135425037342606, + "grad_norm": 8.781489372253418, + "learning_rate": 2.3107624937762327e-05, + "loss": 2.8088, + "step": 5190500 + }, + { + "epoch": 1.6136979360147474, + "grad_norm": 13.935864448547363, + "learning_rate": 2.310503439975421e-05, + "loss": 2.742, + "step": 5191000 + }, + { + "epoch": 1.6138533682952343, + "grad_norm": 11.039144515991211, + "learning_rate": 2.3102443861746094e-05, + "loss": 2.7967, + "step": 5191500 + }, + { + "epoch": 1.6140088005757212, + "grad_norm": 8.84914493560791, + "learning_rate": 2.309985332373798e-05, + "loss": 2.7925, + "step": 5192000 + }, + { + "epoch": 1.614164232856208, + "grad_norm": 8.334202766418457, + "learning_rate": 2.309726278572987e-05, + "loss": 2.7053, + "step": 5192500 + }, + { + "epoch": 1.614319665136695, + "grad_norm": 10.346961975097656, + "learning_rate": 2.3094672247721752e-05, + "loss": 2.809, + "step": 5193000 + }, + { + "epoch": 1.6144750974171818, + "grad_norm": 12.379356384277344, + "learning_rate": 2.3092081709713636e-05, + "loss": 2.798, + "step": 5193500 + }, + { + "epoch": 1.6146305296976686, + "grad_norm": 10.678154945373535, + "learning_rate": 2.3089491171705523e-05, + "loss": 2.7305, + "step": 5194000 + }, + { + "epoch": 1.6147859619781555, + "grad_norm": 10.6973876953125, + "learning_rate": 2.3086900633697407e-05, + "loss": 2.76, + "step": 5194500 + }, + { + "epoch": 1.6149413942586426, + "grad_norm": 8.682082176208496, + "learning_rate": 2.3084310095689294e-05, + "loss": 2.7924, + "step": 5195000 + }, + { + "epoch": 1.6150968265391294, + "grad_norm": 8.192031860351562, + "learning_rate": 2.308171955768118e-05, + "loss": 2.7344, + "step": 5195500 + }, + { + "epoch": 1.6152522588196163, + "grad_norm": 19.9613037109375, + "learning_rate": 2.3079129019673065e-05, + "loss": 2.8241, + "step": 5196000 + }, + { + "epoch": 1.6154076911001032, + "grad_norm": 16.960336685180664, + "learning_rate": 2.307653848166495e-05, + "loss": 2.7318, + "step": 5196500 + }, + { + "epoch": 1.61556312338059, + "grad_norm": 9.149154663085938, + "learning_rate": 2.3073947943656836e-05, + "loss": 2.7948, + "step": 5197000 + }, + { + "epoch": 1.615718555661077, + "grad_norm": 9.555739402770996, + "learning_rate": 2.3071357405648723e-05, + "loss": 2.7781, + "step": 5197500 + }, + { + "epoch": 1.6158739879415638, + "grad_norm": 34.231903076171875, + "learning_rate": 2.3068766867640607e-05, + "loss": 2.749, + "step": 5198000 + }, + { + "epoch": 1.6160294202220506, + "grad_norm": 6.405043601989746, + "learning_rate": 2.306617632963249e-05, + "loss": 2.7375, + "step": 5198500 + }, + { + "epoch": 1.6161848525025375, + "grad_norm": 7.460251331329346, + "learning_rate": 2.3063585791624378e-05, + "loss": 2.7171, + "step": 5199000 + }, + { + "epoch": 1.6163402847830244, + "grad_norm": 15.131481170654297, + "learning_rate": 2.3060995253616262e-05, + "loss": 2.8048, + "step": 5199500 + }, + { + "epoch": 1.6164957170635113, + "grad_norm": 9.467552185058594, + "learning_rate": 2.305840471560815e-05, + "loss": 2.8158, + "step": 5200000 + }, + { + "epoch": 1.6166511493439981, + "grad_norm": 9.590119361877441, + "learning_rate": 2.3055814177600033e-05, + "loss": 2.7696, + "step": 5200500 + }, + { + "epoch": 1.616806581624485, + "grad_norm": 7.061400413513184, + "learning_rate": 2.305322363959192e-05, + "loss": 2.7944, + "step": 5201000 + }, + { + "epoch": 1.6169620139049719, + "grad_norm": 10.220072746276855, + "learning_rate": 2.3050633101583804e-05, + "loss": 2.7823, + "step": 5201500 + }, + { + "epoch": 1.6171174461854587, + "grad_norm": 18.780664443969727, + "learning_rate": 2.304804256357569e-05, + "loss": 2.7851, + "step": 5202000 + }, + { + "epoch": 1.6172728784659456, + "grad_norm": 8.188570022583008, + "learning_rate": 2.3045452025567575e-05, + "loss": 2.7478, + "step": 5202500 + }, + { + "epoch": 1.6174283107464325, + "grad_norm": 10.685011863708496, + "learning_rate": 2.304286148755946e-05, + "loss": 2.7905, + "step": 5203000 + }, + { + "epoch": 1.6175837430269193, + "grad_norm": 9.288482666015625, + "learning_rate": 2.3040270949551345e-05, + "loss": 2.7432, + "step": 5203500 + }, + { + "epoch": 1.6177391753074062, + "grad_norm": 8.173734664916992, + "learning_rate": 2.303768041154323e-05, + "loss": 2.7611, + "step": 5204000 + }, + { + "epoch": 1.617894607587893, + "grad_norm": 9.912980079650879, + "learning_rate": 2.3035089873535116e-05, + "loss": 2.7265, + "step": 5204500 + }, + { + "epoch": 1.61805003986838, + "grad_norm": 10.049778938293457, + "learning_rate": 2.3032499335527004e-05, + "loss": 2.7586, + "step": 5205000 + }, + { + "epoch": 1.6182054721488668, + "grad_norm": 88.55004119873047, + "learning_rate": 2.3029908797518887e-05, + "loss": 2.7575, + "step": 5205500 + }, + { + "epoch": 1.6183609044293537, + "grad_norm": 9.081963539123535, + "learning_rate": 2.302731825951077e-05, + "loss": 2.7689, + "step": 5206000 + }, + { + "epoch": 1.6185163367098405, + "grad_norm": 8.592275619506836, + "learning_rate": 2.3024727721502658e-05, + "loss": 2.7655, + "step": 5206500 + }, + { + "epoch": 1.6186717689903274, + "grad_norm": 6.268710613250732, + "learning_rate": 2.3022137183494545e-05, + "loss": 2.7761, + "step": 5207000 + }, + { + "epoch": 1.6188272012708143, + "grad_norm": 9.069474220275879, + "learning_rate": 2.301954664548643e-05, + "loss": 2.764, + "step": 5207500 + }, + { + "epoch": 1.6189826335513011, + "grad_norm": 8.192069053649902, + "learning_rate": 2.3016956107478316e-05, + "loss": 2.78, + "step": 5208000 + }, + { + "epoch": 1.619138065831788, + "grad_norm": 11.077088356018066, + "learning_rate": 2.30143655694702e-05, + "loss": 2.7492, + "step": 5208500 + }, + { + "epoch": 1.6192934981122749, + "grad_norm": 14.079401969909668, + "learning_rate": 2.3011775031462084e-05, + "loss": 2.7624, + "step": 5209000 + }, + { + "epoch": 1.6194489303927617, + "grad_norm": 10.124218940734863, + "learning_rate": 2.300918449345397e-05, + "loss": 2.731, + "step": 5209500 + }, + { + "epoch": 1.6196043626732486, + "grad_norm": 8.37158203125, + "learning_rate": 2.3006593955445858e-05, + "loss": 2.7745, + "step": 5210000 + }, + { + "epoch": 1.6197597949537355, + "grad_norm": 8.313687324523926, + "learning_rate": 2.3004003417437742e-05, + "loss": 2.7562, + "step": 5210500 + }, + { + "epoch": 1.6199152272342223, + "grad_norm": 8.555100440979004, + "learning_rate": 2.3001412879429626e-05, + "loss": 2.7211, + "step": 5211000 + }, + { + "epoch": 1.6200706595147092, + "grad_norm": 19.30753517150879, + "learning_rate": 2.299882234142151e-05, + "loss": 2.733, + "step": 5211500 + }, + { + "epoch": 1.620226091795196, + "grad_norm": 10.421379089355469, + "learning_rate": 2.29962318034134e-05, + "loss": 2.7786, + "step": 5212000 + }, + { + "epoch": 1.620381524075683, + "grad_norm": 16.591272354125977, + "learning_rate": 2.2993641265405284e-05, + "loss": 2.7274, + "step": 5212500 + }, + { + "epoch": 1.62053695635617, + "grad_norm": 10.22301197052002, + "learning_rate": 2.2991050727397167e-05, + "loss": 2.8183, + "step": 5213000 + }, + { + "epoch": 1.6206923886366569, + "grad_norm": 9.4662504196167, + "learning_rate": 2.2988460189389055e-05, + "loss": 2.7683, + "step": 5213500 + }, + { + "epoch": 1.6208478209171437, + "grad_norm": 8.87673568725586, + "learning_rate": 2.298586965138094e-05, + "loss": 2.7422, + "step": 5214000 + }, + { + "epoch": 1.6210032531976306, + "grad_norm": 9.74357795715332, + "learning_rate": 2.2983279113372826e-05, + "loss": 2.783, + "step": 5214500 + }, + { + "epoch": 1.6211586854781175, + "grad_norm": 9.773710250854492, + "learning_rate": 2.298068857536471e-05, + "loss": 2.7982, + "step": 5215000 + }, + { + "epoch": 1.6213141177586043, + "grad_norm": 10.850138664245605, + "learning_rate": 2.2978098037356596e-05, + "loss": 2.7022, + "step": 5215500 + }, + { + "epoch": 1.6214695500390912, + "grad_norm": 9.422279357910156, + "learning_rate": 2.297550749934848e-05, + "loss": 2.7501, + "step": 5216000 + }, + { + "epoch": 1.621624982319578, + "grad_norm": 8.039783477783203, + "learning_rate": 2.2972916961340364e-05, + "loss": 2.7578, + "step": 5216500 + }, + { + "epoch": 1.621780414600065, + "grad_norm": 8.26234245300293, + "learning_rate": 2.2970326423332255e-05, + "loss": 2.7428, + "step": 5217000 + }, + { + "epoch": 1.6219358468805518, + "grad_norm": 8.809843063354492, + "learning_rate": 2.2967735885324138e-05, + "loss": 2.7506, + "step": 5217500 + }, + { + "epoch": 1.6220912791610387, + "grad_norm": 13.405572891235352, + "learning_rate": 2.2965145347316022e-05, + "loss": 2.7311, + "step": 5218000 + }, + { + "epoch": 1.6222467114415255, + "grad_norm": 7.411009788513184, + "learning_rate": 2.2962554809307906e-05, + "loss": 2.7159, + "step": 5218500 + }, + { + "epoch": 1.6224021437220126, + "grad_norm": 8.749773025512695, + "learning_rate": 2.2959964271299793e-05, + "loss": 2.7953, + "step": 5219000 + }, + { + "epoch": 1.6225575760024995, + "grad_norm": 7.805665493011475, + "learning_rate": 2.295737373329168e-05, + "loss": 2.7614, + "step": 5219500 + }, + { + "epoch": 1.6227130082829864, + "grad_norm": 7.707053184509277, + "learning_rate": 2.2954783195283564e-05, + "loss": 2.748, + "step": 5220000 + }, + { + "epoch": 1.6228684405634732, + "grad_norm": 8.413620948791504, + "learning_rate": 2.2952192657275448e-05, + "loss": 2.7516, + "step": 5220500 + }, + { + "epoch": 1.62302387284396, + "grad_norm": 21.030284881591797, + "learning_rate": 2.2949602119267335e-05, + "loss": 2.7918, + "step": 5221000 + }, + { + "epoch": 1.623179305124447, + "grad_norm": 10.06893539428711, + "learning_rate": 2.294701158125922e-05, + "loss": 2.719, + "step": 5221500 + }, + { + "epoch": 1.6233347374049338, + "grad_norm": 10.235345840454102, + "learning_rate": 2.2944421043251106e-05, + "loss": 2.7769, + "step": 5222000 + }, + { + "epoch": 1.6234901696854207, + "grad_norm": 8.697339057922363, + "learning_rate": 2.2941830505242993e-05, + "loss": 2.7609, + "step": 5222500 + }, + { + "epoch": 1.6236456019659076, + "grad_norm": 8.542622566223145, + "learning_rate": 2.2939239967234877e-05, + "loss": 2.7596, + "step": 5223000 + }, + { + "epoch": 1.6238010342463944, + "grad_norm": 9.043251037597656, + "learning_rate": 2.293664942922676e-05, + "loss": 2.7158, + "step": 5223500 + }, + { + "epoch": 1.6239564665268813, + "grad_norm": 7.426293849945068, + "learning_rate": 2.2934058891218648e-05, + "loss": 2.7845, + "step": 5224000 + }, + { + "epoch": 1.6241118988073682, + "grad_norm": 8.971549034118652, + "learning_rate": 2.2931468353210535e-05, + "loss": 2.7817, + "step": 5224500 + }, + { + "epoch": 1.624267331087855, + "grad_norm": 18.29804229736328, + "learning_rate": 2.292887781520242e-05, + "loss": 2.7461, + "step": 5225000 + }, + { + "epoch": 1.624422763368342, + "grad_norm": 9.860267639160156, + "learning_rate": 2.2926287277194302e-05, + "loss": 2.7865, + "step": 5225500 + }, + { + "epoch": 1.6245781956488288, + "grad_norm": 11.955663681030273, + "learning_rate": 2.292369673918619e-05, + "loss": 2.7398, + "step": 5226000 + }, + { + "epoch": 1.6247336279293156, + "grad_norm": 8.55682373046875, + "learning_rate": 2.2921106201178073e-05, + "loss": 2.8063, + "step": 5226500 + }, + { + "epoch": 1.6248890602098025, + "grad_norm": 8.689617156982422, + "learning_rate": 2.291851566316996e-05, + "loss": 2.7953, + "step": 5227000 + }, + { + "epoch": 1.6250444924902894, + "grad_norm": 10.574773788452148, + "learning_rate": 2.2915925125161844e-05, + "loss": 2.7798, + "step": 5227500 + }, + { + "epoch": 1.6251999247707762, + "grad_norm": 9.316084861755371, + "learning_rate": 2.291333458715373e-05, + "loss": 2.7794, + "step": 5228000 + }, + { + "epoch": 1.625355357051263, + "grad_norm": 10.595969200134277, + "learning_rate": 2.2910744049145615e-05, + "loss": 2.8132, + "step": 5228500 + }, + { + "epoch": 1.62551078933175, + "grad_norm": 14.180503845214844, + "learning_rate": 2.2908153511137502e-05, + "loss": 2.7658, + "step": 5229000 + }, + { + "epoch": 1.6256662216122368, + "grad_norm": 10.774663925170898, + "learning_rate": 2.2905562973129386e-05, + "loss": 2.7454, + "step": 5229500 + }, + { + "epoch": 1.6258216538927237, + "grad_norm": 16.3324031829834, + "learning_rate": 2.2902972435121273e-05, + "loss": 2.7514, + "step": 5230000 + }, + { + "epoch": 1.6259770861732106, + "grad_norm": 9.993391990661621, + "learning_rate": 2.2900381897113157e-05, + "loss": 2.7707, + "step": 5230500 + }, + { + "epoch": 1.6261325184536974, + "grad_norm": 33.62067794799805, + "learning_rate": 2.289779135910504e-05, + "loss": 2.752, + "step": 5231000 + }, + { + "epoch": 1.6262879507341843, + "grad_norm": 10.689526557922363, + "learning_rate": 2.2895200821096928e-05, + "loss": 2.801, + "step": 5231500 + }, + { + "epoch": 1.6264433830146712, + "grad_norm": 10.570228576660156, + "learning_rate": 2.2892610283088815e-05, + "loss": 2.724, + "step": 5232000 + }, + { + "epoch": 1.626598815295158, + "grad_norm": 8.129054069519043, + "learning_rate": 2.28900197450807e-05, + "loss": 2.7498, + "step": 5232500 + }, + { + "epoch": 1.626754247575645, + "grad_norm": 9.964072227478027, + "learning_rate": 2.2887429207072582e-05, + "loss": 2.7661, + "step": 5233000 + }, + { + "epoch": 1.6269096798561318, + "grad_norm": 9.274250984191895, + "learning_rate": 2.288483866906447e-05, + "loss": 2.7413, + "step": 5233500 + }, + { + "epoch": 1.6270651121366186, + "grad_norm": 11.5326509475708, + "learning_rate": 2.2882248131056357e-05, + "loss": 2.7368, + "step": 5234000 + }, + { + "epoch": 1.6272205444171055, + "grad_norm": 8.44141960144043, + "learning_rate": 2.287965759304824e-05, + "loss": 2.7514, + "step": 5234500 + }, + { + "epoch": 1.6273759766975924, + "grad_norm": 8.501402854919434, + "learning_rate": 2.2877067055040128e-05, + "loss": 2.785, + "step": 5235000 + }, + { + "epoch": 1.6275314089780792, + "grad_norm": 19.27275848388672, + "learning_rate": 2.287447651703201e-05, + "loss": 2.7588, + "step": 5235500 + }, + { + "epoch": 1.627686841258566, + "grad_norm": 10.120408058166504, + "learning_rate": 2.2871885979023895e-05, + "loss": 2.7389, + "step": 5236000 + }, + { + "epoch": 1.627842273539053, + "grad_norm": 9.873895645141602, + "learning_rate": 2.2869295441015782e-05, + "loss": 2.7827, + "step": 5236500 + }, + { + "epoch": 1.62799770581954, + "grad_norm": 30.834545135498047, + "learning_rate": 2.286670490300767e-05, + "loss": 2.7847, + "step": 5237000 + }, + { + "epoch": 1.628153138100027, + "grad_norm": 6.947516441345215, + "learning_rate": 2.2864114364999553e-05, + "loss": 2.7739, + "step": 5237500 + }, + { + "epoch": 1.6283085703805138, + "grad_norm": 11.332550048828125, + "learning_rate": 2.2861523826991437e-05, + "loss": 2.7326, + "step": 5238000 + }, + { + "epoch": 1.6284640026610007, + "grad_norm": 6.4523234367370605, + "learning_rate": 2.285893328898332e-05, + "loss": 2.7609, + "step": 5238500 + }, + { + "epoch": 1.6286194349414875, + "grad_norm": 8.573729515075684, + "learning_rate": 2.285634275097521e-05, + "loss": 2.7956, + "step": 5239000 + }, + { + "epoch": 1.6287748672219744, + "grad_norm": 8.057080268859863, + "learning_rate": 2.2853752212967095e-05, + "loss": 2.7322, + "step": 5239500 + }, + { + "epoch": 1.6289302995024613, + "grad_norm": 9.873427391052246, + "learning_rate": 2.285116167495898e-05, + "loss": 2.7422, + "step": 5240000 + }, + { + "epoch": 1.6290857317829481, + "grad_norm": 14.816413879394531, + "learning_rate": 2.2848571136950866e-05, + "loss": 2.7786, + "step": 5240500 + }, + { + "epoch": 1.629241164063435, + "grad_norm": 10.810369491577148, + "learning_rate": 2.284598059894275e-05, + "loss": 2.7582, + "step": 5241000 + }, + { + "epoch": 1.6293965963439219, + "grad_norm": 12.296606063842773, + "learning_rate": 2.2843390060934637e-05, + "loss": 2.7687, + "step": 5241500 + }, + { + "epoch": 1.6295520286244087, + "grad_norm": 8.26504898071289, + "learning_rate": 2.284079952292652e-05, + "loss": 2.7253, + "step": 5242000 + }, + { + "epoch": 1.6297074609048956, + "grad_norm": 8.332923889160156, + "learning_rate": 2.2838208984918408e-05, + "loss": 2.7324, + "step": 5242500 + }, + { + "epoch": 1.6298628931853827, + "grad_norm": 17.419937133789062, + "learning_rate": 2.283561844691029e-05, + "loss": 2.7014, + "step": 5243000 + }, + { + "epoch": 1.6300183254658696, + "grad_norm": 9.855897903442383, + "learning_rate": 2.2833027908902175e-05, + "loss": 2.7164, + "step": 5243500 + }, + { + "epoch": 1.6301737577463564, + "grad_norm": 13.786953926086426, + "learning_rate": 2.2830437370894066e-05, + "loss": 2.7826, + "step": 5244000 + }, + { + "epoch": 1.6303291900268433, + "grad_norm": 10.037250518798828, + "learning_rate": 2.282784683288595e-05, + "loss": 2.7534, + "step": 5244500 + }, + { + "epoch": 1.6304846223073302, + "grad_norm": 11.63634204864502, + "learning_rate": 2.2825256294877833e-05, + "loss": 2.7661, + "step": 5245000 + }, + { + "epoch": 1.630640054587817, + "grad_norm": 6.395200252532959, + "learning_rate": 2.2822665756869717e-05, + "loss": 2.7595, + "step": 5245500 + }, + { + "epoch": 1.630795486868304, + "grad_norm": 11.531866073608398, + "learning_rate": 2.2820075218861604e-05, + "loss": 2.7557, + "step": 5246000 + }, + { + "epoch": 1.6309509191487908, + "grad_norm": 9.926621437072754, + "learning_rate": 2.281748468085349e-05, + "loss": 2.7673, + "step": 5246500 + }, + { + "epoch": 1.6311063514292776, + "grad_norm": 72.25942993164062, + "learning_rate": 2.2814894142845375e-05, + "loss": 2.7627, + "step": 5247000 + }, + { + "epoch": 1.6312617837097645, + "grad_norm": 8.87840747833252, + "learning_rate": 2.281230360483726e-05, + "loss": 2.7922, + "step": 5247500 + }, + { + "epoch": 1.6314172159902514, + "grad_norm": 9.616312980651855, + "learning_rate": 2.2809713066829146e-05, + "loss": 2.7868, + "step": 5248000 + }, + { + "epoch": 1.6315726482707382, + "grad_norm": 17.065387725830078, + "learning_rate": 2.280712252882103e-05, + "loss": 2.7174, + "step": 5248500 + }, + { + "epoch": 1.631728080551225, + "grad_norm": 11.115771293640137, + "learning_rate": 2.2804531990812917e-05, + "loss": 2.6996, + "step": 5249000 + }, + { + "epoch": 1.631883512831712, + "grad_norm": 9.580097198486328, + "learning_rate": 2.2801941452804804e-05, + "loss": 2.7717, + "step": 5249500 + }, + { + "epoch": 1.6320389451121988, + "grad_norm": 10.512418746948242, + "learning_rate": 2.2799350914796688e-05, + "loss": 2.7296, + "step": 5250000 + }, + { + "epoch": 1.6321943773926857, + "grad_norm": 18.095287322998047, + "learning_rate": 2.2796760376788572e-05, + "loss": 2.7993, + "step": 5250500 + }, + { + "epoch": 1.6323498096731726, + "grad_norm": 8.108040809631348, + "learning_rate": 2.279416983878046e-05, + "loss": 2.6947, + "step": 5251000 + }, + { + "epoch": 1.6325052419536594, + "grad_norm": 8.744894981384277, + "learning_rate": 2.2791579300772346e-05, + "loss": 2.7139, + "step": 5251500 + }, + { + "epoch": 1.6326606742341463, + "grad_norm": 8.321329116821289, + "learning_rate": 2.278898876276423e-05, + "loss": 2.766, + "step": 5252000 + }, + { + "epoch": 1.6328161065146332, + "grad_norm": 11.13243293762207, + "learning_rate": 2.2786398224756114e-05, + "loss": 2.7538, + "step": 5252500 + }, + { + "epoch": 1.63297153879512, + "grad_norm": 10.43362045288086, + "learning_rate": 2.2783807686748e-05, + "loss": 2.7431, + "step": 5253000 + }, + { + "epoch": 1.633126971075607, + "grad_norm": 11.96624755859375, + "learning_rate": 2.2781217148739885e-05, + "loss": 2.7803, + "step": 5253500 + }, + { + "epoch": 1.6332824033560938, + "grad_norm": 7.90164852142334, + "learning_rate": 2.2778626610731772e-05, + "loss": 2.8078, + "step": 5254000 + }, + { + "epoch": 1.6334378356365806, + "grad_norm": 11.096426963806152, + "learning_rate": 2.2776036072723656e-05, + "loss": 2.718, + "step": 5254500 + }, + { + "epoch": 1.6335932679170675, + "grad_norm": 9.583487510681152, + "learning_rate": 2.2773445534715543e-05, + "loss": 2.7845, + "step": 5255000 + }, + { + "epoch": 1.6337487001975544, + "grad_norm": 17.3078670501709, + "learning_rate": 2.2770854996707426e-05, + "loss": 2.7629, + "step": 5255500 + }, + { + "epoch": 1.6339041324780412, + "grad_norm": 9.428930282592773, + "learning_rate": 2.2768264458699314e-05, + "loss": 2.7656, + "step": 5256000 + }, + { + "epoch": 1.634059564758528, + "grad_norm": 7.348720550537109, + "learning_rate": 2.2765673920691197e-05, + "loss": 2.7454, + "step": 5256500 + }, + { + "epoch": 1.634214997039015, + "grad_norm": 10.936031341552734, + "learning_rate": 2.2763083382683085e-05, + "loss": 2.8025, + "step": 5257000 + }, + { + "epoch": 1.6343704293195018, + "grad_norm": 9.4026517868042, + "learning_rate": 2.2760492844674968e-05, + "loss": 2.73, + "step": 5257500 + }, + { + "epoch": 1.6345258615999887, + "grad_norm": 8.548824310302734, + "learning_rate": 2.2757902306666852e-05, + "loss": 2.7617, + "step": 5258000 + }, + { + "epoch": 1.6346812938804756, + "grad_norm": 7.956847190856934, + "learning_rate": 2.275531176865874e-05, + "loss": 2.7665, + "step": 5258500 + }, + { + "epoch": 1.6348367261609624, + "grad_norm": 13.05648136138916, + "learning_rate": 2.2752721230650626e-05, + "loss": 2.7516, + "step": 5259000 + }, + { + "epoch": 1.6349921584414493, + "grad_norm": 7.826237201690674, + "learning_rate": 2.275013069264251e-05, + "loss": 2.7693, + "step": 5259500 + }, + { + "epoch": 1.6351475907219362, + "grad_norm": 10.332470893859863, + "learning_rate": 2.2747540154634394e-05, + "loss": 2.7847, + "step": 5260000 + }, + { + "epoch": 1.635303023002423, + "grad_norm": 8.447418212890625, + "learning_rate": 2.274494961662628e-05, + "loss": 2.7568, + "step": 5260500 + }, + { + "epoch": 1.6354584552829101, + "grad_norm": 15.972578048706055, + "learning_rate": 2.2742359078618168e-05, + "loss": 2.7518, + "step": 5261000 + }, + { + "epoch": 1.635613887563397, + "grad_norm": 10.87665843963623, + "learning_rate": 2.2739768540610052e-05, + "loss": 2.7411, + "step": 5261500 + }, + { + "epoch": 1.6357693198438839, + "grad_norm": 10.136238098144531, + "learning_rate": 2.273717800260194e-05, + "loss": 2.7417, + "step": 5262000 + }, + { + "epoch": 1.6359247521243707, + "grad_norm": 10.190889358520508, + "learning_rate": 2.2734587464593823e-05, + "loss": 2.7639, + "step": 5262500 + }, + { + "epoch": 1.6360801844048576, + "grad_norm": 8.079415321350098, + "learning_rate": 2.2731996926585707e-05, + "loss": 2.7721, + "step": 5263000 + }, + { + "epoch": 1.6362356166853445, + "grad_norm": 8.532559394836426, + "learning_rate": 2.2729406388577594e-05, + "loss": 2.7323, + "step": 5263500 + }, + { + "epoch": 1.6363910489658313, + "grad_norm": 10.258855819702148, + "learning_rate": 2.272681585056948e-05, + "loss": 2.7515, + "step": 5264000 + }, + { + "epoch": 1.6365464812463182, + "grad_norm": 9.020101547241211, + "learning_rate": 2.2724225312561365e-05, + "loss": 2.7633, + "step": 5264500 + }, + { + "epoch": 1.636701913526805, + "grad_norm": 10.12193489074707, + "learning_rate": 2.272163477455325e-05, + "loss": 2.7525, + "step": 5265000 + }, + { + "epoch": 1.636857345807292, + "grad_norm": 8.011967658996582, + "learning_rate": 2.2719044236545132e-05, + "loss": 2.7356, + "step": 5265500 + }, + { + "epoch": 1.6370127780877788, + "grad_norm": 9.446487426757812, + "learning_rate": 2.2716453698537023e-05, + "loss": 2.7474, + "step": 5266000 + }, + { + "epoch": 1.6371682103682657, + "grad_norm": 12.031342506408691, + "learning_rate": 2.2713863160528907e-05, + "loss": 2.792, + "step": 5266500 + }, + { + "epoch": 1.6373236426487527, + "grad_norm": 8.190587997436523, + "learning_rate": 2.271127262252079e-05, + "loss": 2.7229, + "step": 5267000 + }, + { + "epoch": 1.6374790749292396, + "grad_norm": 10.579652786254883, + "learning_rate": 2.2708682084512677e-05, + "loss": 2.7379, + "step": 5267500 + }, + { + "epoch": 1.6376345072097265, + "grad_norm": 8.919604301452637, + "learning_rate": 2.270609154650456e-05, + "loss": 2.7729, + "step": 5268000 + }, + { + "epoch": 1.6377899394902133, + "grad_norm": 9.708200454711914, + "learning_rate": 2.270350100849645e-05, + "loss": 2.7546, + "step": 5268500 + }, + { + "epoch": 1.6379453717707002, + "grad_norm": 8.459854125976562, + "learning_rate": 2.2700910470488332e-05, + "loss": 2.7078, + "step": 5269000 + }, + { + "epoch": 1.638100804051187, + "grad_norm": 10.311699867248535, + "learning_rate": 2.269831993248022e-05, + "loss": 2.7419, + "step": 5269500 + }, + { + "epoch": 1.638256236331674, + "grad_norm": 7.692590713500977, + "learning_rate": 2.2695729394472103e-05, + "loss": 2.7436, + "step": 5270000 + }, + { + "epoch": 1.6384116686121608, + "grad_norm": 10.045785903930664, + "learning_rate": 2.269313885646399e-05, + "loss": 2.7542, + "step": 5270500 + }, + { + "epoch": 1.6385671008926477, + "grad_norm": 17.155893325805664, + "learning_rate": 2.2690548318455877e-05, + "loss": 2.7545, + "step": 5271000 + }, + { + "epoch": 1.6387225331731345, + "grad_norm": 33.55492401123047, + "learning_rate": 2.268795778044776e-05, + "loss": 2.7216, + "step": 5271500 + }, + { + "epoch": 1.6388779654536214, + "grad_norm": 9.426265716552734, + "learning_rate": 2.2685367242439645e-05, + "loss": 2.7567, + "step": 5272000 + }, + { + "epoch": 1.6390333977341083, + "grad_norm": 15.902740478515625, + "learning_rate": 2.268277670443153e-05, + "loss": 2.7603, + "step": 5272500 + }, + { + "epoch": 1.6391888300145951, + "grad_norm": 12.856039047241211, + "learning_rate": 2.2680186166423416e-05, + "loss": 2.745, + "step": 5273000 + }, + { + "epoch": 1.639344262295082, + "grad_norm": 9.715386390686035, + "learning_rate": 2.2677595628415303e-05, + "loss": 2.7344, + "step": 5273500 + }, + { + "epoch": 1.6394996945755689, + "grad_norm": 11.999160766601562, + "learning_rate": 2.2675005090407187e-05, + "loss": 2.6908, + "step": 5274000 + }, + { + "epoch": 1.6396551268560557, + "grad_norm": 8.500818252563477, + "learning_rate": 2.267241455239907e-05, + "loss": 2.8155, + "step": 5274500 + }, + { + "epoch": 1.6398105591365426, + "grad_norm": 8.356919288635254, + "learning_rate": 2.2669824014390958e-05, + "loss": 2.7708, + "step": 5275000 + }, + { + "epoch": 1.6399659914170295, + "grad_norm": 9.231110572814941, + "learning_rate": 2.2667233476382845e-05, + "loss": 2.7386, + "step": 5275500 + }, + { + "epoch": 1.6401214236975163, + "grad_norm": 15.819430351257324, + "learning_rate": 2.266464293837473e-05, + "loss": 2.7309, + "step": 5276000 + }, + { + "epoch": 1.6402768559780032, + "grad_norm": 9.72925853729248, + "learning_rate": 2.2662052400366616e-05, + "loss": 2.725, + "step": 5276500 + }, + { + "epoch": 1.64043228825849, + "grad_norm": 10.086098670959473, + "learning_rate": 2.26594618623585e-05, + "loss": 2.7533, + "step": 5277000 + }, + { + "epoch": 1.640587720538977, + "grad_norm": 11.314196586608887, + "learning_rate": 2.2656871324350383e-05, + "loss": 2.6978, + "step": 5277500 + }, + { + "epoch": 1.6407431528194638, + "grad_norm": 8.574586868286133, + "learning_rate": 2.265428078634227e-05, + "loss": 2.7135, + "step": 5278000 + }, + { + "epoch": 1.6408985850999507, + "grad_norm": 10.524843215942383, + "learning_rate": 2.2651690248334158e-05, + "loss": 2.7669, + "step": 5278500 + }, + { + "epoch": 1.6410540173804375, + "grad_norm": 32.52348709106445, + "learning_rate": 2.264909971032604e-05, + "loss": 2.7188, + "step": 5279000 + }, + { + "epoch": 1.6412094496609244, + "grad_norm": 7.685974597930908, + "learning_rate": 2.2646509172317925e-05, + "loss": 2.7787, + "step": 5279500 + }, + { + "epoch": 1.6413648819414113, + "grad_norm": 9.843809127807617, + "learning_rate": 2.2643918634309812e-05, + "loss": 2.7592, + "step": 5280000 + }, + { + "epoch": 1.6415203142218981, + "grad_norm": 9.859452247619629, + "learning_rate": 2.26413280963017e-05, + "loss": 2.7612, + "step": 5280500 + }, + { + "epoch": 1.641675746502385, + "grad_norm": 13.434051513671875, + "learning_rate": 2.2638737558293583e-05, + "loss": 2.775, + "step": 5281000 + }, + { + "epoch": 1.6418311787828719, + "grad_norm": 6.330623626708984, + "learning_rate": 2.2636147020285467e-05, + "loss": 2.7639, + "step": 5281500 + }, + { + "epoch": 1.6419866110633587, + "grad_norm": 7.999519348144531, + "learning_rate": 2.2633556482277354e-05, + "loss": 2.7394, + "step": 5282000 + }, + { + "epoch": 1.6421420433438456, + "grad_norm": 19.968931198120117, + "learning_rate": 2.2630965944269238e-05, + "loss": 2.8078, + "step": 5282500 + }, + { + "epoch": 1.6422974756243325, + "grad_norm": 9.71213436126709, + "learning_rate": 2.2628375406261125e-05, + "loss": 2.7855, + "step": 5283000 + }, + { + "epoch": 1.6424529079048193, + "grad_norm": 17.25717544555664, + "learning_rate": 2.262578486825301e-05, + "loss": 2.7627, + "step": 5283500 + }, + { + "epoch": 1.6426083401853062, + "grad_norm": 8.924691200256348, + "learning_rate": 2.2623194330244896e-05, + "loss": 2.7746, + "step": 5284000 + }, + { + "epoch": 1.642763772465793, + "grad_norm": 11.271258354187012, + "learning_rate": 2.262060379223678e-05, + "loss": 2.7527, + "step": 5284500 + }, + { + "epoch": 1.6429192047462802, + "grad_norm": 16.5033016204834, + "learning_rate": 2.2618013254228663e-05, + "loss": 2.7589, + "step": 5285000 + }, + { + "epoch": 1.643074637026767, + "grad_norm": 9.447420120239258, + "learning_rate": 2.2615422716220554e-05, + "loss": 2.7543, + "step": 5285500 + }, + { + "epoch": 1.643230069307254, + "grad_norm": 16.139617919921875, + "learning_rate": 2.2612832178212438e-05, + "loss": 2.7411, + "step": 5286000 + }, + { + "epoch": 1.6433855015877408, + "grad_norm": 9.098784446716309, + "learning_rate": 2.261024164020432e-05, + "loss": 2.7605, + "step": 5286500 + }, + { + "epoch": 1.6435409338682276, + "grad_norm": 21.918537139892578, + "learning_rate": 2.2607651102196205e-05, + "loss": 2.7198, + "step": 5287000 + }, + { + "epoch": 1.6436963661487145, + "grad_norm": 8.697933197021484, + "learning_rate": 2.2605060564188092e-05, + "loss": 2.7612, + "step": 5287500 + }, + { + "epoch": 1.6438517984292014, + "grad_norm": 8.728720664978027, + "learning_rate": 2.260247002617998e-05, + "loss": 2.7461, + "step": 5288000 + }, + { + "epoch": 1.6440072307096882, + "grad_norm": 9.68999195098877, + "learning_rate": 2.2599879488171863e-05, + "loss": 2.7163, + "step": 5288500 + }, + { + "epoch": 1.644162662990175, + "grad_norm": 10.177478790283203, + "learning_rate": 2.259728895016375e-05, + "loss": 2.7037, + "step": 5289000 + }, + { + "epoch": 1.644318095270662, + "grad_norm": 9.962031364440918, + "learning_rate": 2.2594698412155634e-05, + "loss": 2.7699, + "step": 5289500 + }, + { + "epoch": 1.6444735275511488, + "grad_norm": 10.051238059997559, + "learning_rate": 2.2592107874147518e-05, + "loss": 2.7275, + "step": 5290000 + }, + { + "epoch": 1.6446289598316357, + "grad_norm": 12.663701057434082, + "learning_rate": 2.2589517336139405e-05, + "loss": 2.7341, + "step": 5290500 + }, + { + "epoch": 1.6447843921121228, + "grad_norm": 10.559642791748047, + "learning_rate": 2.2586926798131292e-05, + "loss": 2.777, + "step": 5291000 + }, + { + "epoch": 1.6449398243926097, + "grad_norm": 9.533018112182617, + "learning_rate": 2.2584336260123176e-05, + "loss": 2.7958, + "step": 5291500 + }, + { + "epoch": 1.6450952566730965, + "grad_norm": 9.575366973876953, + "learning_rate": 2.258174572211506e-05, + "loss": 2.854, + "step": 5292000 + }, + { + "epoch": 1.6452506889535834, + "grad_norm": 9.241612434387207, + "learning_rate": 2.2579155184106947e-05, + "loss": 2.7572, + "step": 5292500 + }, + { + "epoch": 1.6454061212340703, + "grad_norm": 7.798429012298584, + "learning_rate": 2.2576564646098834e-05, + "loss": 2.7331, + "step": 5293000 + }, + { + "epoch": 1.6455615535145571, + "grad_norm": 15.808497428894043, + "learning_rate": 2.2573974108090718e-05, + "loss": 2.7583, + "step": 5293500 + }, + { + "epoch": 1.645716985795044, + "grad_norm": 10.010919570922852, + "learning_rate": 2.2571383570082602e-05, + "loss": 2.7257, + "step": 5294000 + }, + { + "epoch": 1.6458724180755309, + "grad_norm": 8.244704246520996, + "learning_rate": 2.256879303207449e-05, + "loss": 2.7189, + "step": 5294500 + }, + { + "epoch": 1.6460278503560177, + "grad_norm": 19.1842098236084, + "learning_rate": 2.2566202494066373e-05, + "loss": 2.7321, + "step": 5295000 + }, + { + "epoch": 1.6461832826365046, + "grad_norm": 10.032052040100098, + "learning_rate": 2.256361195605826e-05, + "loss": 2.7812, + "step": 5295500 + }, + { + "epoch": 1.6463387149169915, + "grad_norm": 9.926689147949219, + "learning_rate": 2.2561021418050144e-05, + "loss": 2.7393, + "step": 5296000 + }, + { + "epoch": 1.6464941471974783, + "grad_norm": 9.905220031738281, + "learning_rate": 2.255843088004203e-05, + "loss": 2.7765, + "step": 5296500 + }, + { + "epoch": 1.6466495794779652, + "grad_norm": 17.26708221435547, + "learning_rate": 2.2555840342033914e-05, + "loss": 2.7429, + "step": 5297000 + }, + { + "epoch": 1.646805011758452, + "grad_norm": 9.069711685180664, + "learning_rate": 2.25532498040258e-05, + "loss": 2.7632, + "step": 5297500 + }, + { + "epoch": 1.646960444038939, + "grad_norm": 8.197015762329102, + "learning_rate": 2.255065926601769e-05, + "loss": 2.7891, + "step": 5298000 + }, + { + "epoch": 1.6471158763194258, + "grad_norm": 10.173219680786133, + "learning_rate": 2.2548068728009573e-05, + "loss": 2.7404, + "step": 5298500 + }, + { + "epoch": 1.6472713085999127, + "grad_norm": 10.366761207580566, + "learning_rate": 2.2545478190001456e-05, + "loss": 2.7285, + "step": 5299000 + }, + { + "epoch": 1.6474267408803995, + "grad_norm": 9.019426345825195, + "learning_rate": 2.254288765199334e-05, + "loss": 2.8031, + "step": 5299500 + }, + { + "epoch": 1.6475821731608864, + "grad_norm": 12.099949836730957, + "learning_rate": 2.2540297113985227e-05, + "loss": 2.7714, + "step": 5300000 + }, + { + "epoch": 1.6477376054413733, + "grad_norm": 11.221392631530762, + "learning_rate": 2.2537706575977114e-05, + "loss": 2.7978, + "step": 5300500 + }, + { + "epoch": 1.6478930377218601, + "grad_norm": 10.088279724121094, + "learning_rate": 2.2535116037968998e-05, + "loss": 2.7206, + "step": 5301000 + }, + { + "epoch": 1.648048470002347, + "grad_norm": 8.3203706741333, + "learning_rate": 2.2532525499960882e-05, + "loss": 2.7503, + "step": 5301500 + }, + { + "epoch": 1.6482039022828339, + "grad_norm": 18.34029197692871, + "learning_rate": 2.252993496195277e-05, + "loss": 2.7384, + "step": 5302000 + }, + { + "epoch": 1.6483593345633207, + "grad_norm": 10.44705867767334, + "learning_rate": 2.2527344423944656e-05, + "loss": 2.7757, + "step": 5302500 + }, + { + "epoch": 1.6485147668438076, + "grad_norm": 8.97336196899414, + "learning_rate": 2.252475388593654e-05, + "loss": 2.7759, + "step": 5303000 + }, + { + "epoch": 1.6486701991242945, + "grad_norm": 8.554375648498535, + "learning_rate": 2.2522163347928427e-05, + "loss": 2.7704, + "step": 5303500 + }, + { + "epoch": 1.6488256314047813, + "grad_norm": 9.55093765258789, + "learning_rate": 2.251957280992031e-05, + "loss": 2.755, + "step": 5304000 + }, + { + "epoch": 1.6489810636852682, + "grad_norm": 10.601434707641602, + "learning_rate": 2.2516982271912195e-05, + "loss": 2.7566, + "step": 5304500 + }, + { + "epoch": 1.649136495965755, + "grad_norm": 9.114293098449707, + "learning_rate": 2.2514391733904082e-05, + "loss": 2.7675, + "step": 5305000 + }, + { + "epoch": 1.649291928246242, + "grad_norm": 12.438447952270508, + "learning_rate": 2.251180119589597e-05, + "loss": 2.7367, + "step": 5305500 + }, + { + "epoch": 1.6494473605267288, + "grad_norm": 10.201552391052246, + "learning_rate": 2.2509210657887853e-05, + "loss": 2.721, + "step": 5306000 + }, + { + "epoch": 1.6496027928072157, + "grad_norm": 6.925745487213135, + "learning_rate": 2.2506620119879737e-05, + "loss": 2.7796, + "step": 5306500 + }, + { + "epoch": 1.6497582250877025, + "grad_norm": 13.22119426727295, + "learning_rate": 2.2504029581871624e-05, + "loss": 2.8216, + "step": 5307000 + }, + { + "epoch": 1.6499136573681894, + "grad_norm": 9.35433578491211, + "learning_rate": 2.250143904386351e-05, + "loss": 2.7194, + "step": 5307500 + }, + { + "epoch": 1.6500690896486763, + "grad_norm": 5.534494876861572, + "learning_rate": 2.2498848505855395e-05, + "loss": 2.7431, + "step": 5308000 + }, + { + "epoch": 1.6502245219291631, + "grad_norm": 10.27292537689209, + "learning_rate": 2.249625796784728e-05, + "loss": 2.7588, + "step": 5308500 + }, + { + "epoch": 1.6503799542096502, + "grad_norm": 6.370461463928223, + "learning_rate": 2.2493667429839165e-05, + "loss": 2.7523, + "step": 5309000 + }, + { + "epoch": 1.650535386490137, + "grad_norm": 8.849328994750977, + "learning_rate": 2.249107689183105e-05, + "loss": 2.7921, + "step": 5309500 + }, + { + "epoch": 1.650690818770624, + "grad_norm": 8.63145923614502, + "learning_rate": 2.2488486353822936e-05, + "loss": 2.7095, + "step": 5310000 + }, + { + "epoch": 1.6508462510511108, + "grad_norm": 8.953980445861816, + "learning_rate": 2.2485895815814824e-05, + "loss": 2.7665, + "step": 5310500 + }, + { + "epoch": 1.6510016833315977, + "grad_norm": 16.792020797729492, + "learning_rate": 2.2483305277806707e-05, + "loss": 2.7593, + "step": 5311000 + }, + { + "epoch": 1.6511571156120846, + "grad_norm": 10.566951751708984, + "learning_rate": 2.248071473979859e-05, + "loss": 2.7512, + "step": 5311500 + }, + { + "epoch": 1.6513125478925714, + "grad_norm": 8.877185821533203, + "learning_rate": 2.2478124201790475e-05, + "loss": 2.7654, + "step": 5312000 + }, + { + "epoch": 1.6514679801730583, + "grad_norm": 12.302571296691895, + "learning_rate": 2.2475533663782365e-05, + "loss": 2.7414, + "step": 5312500 + }, + { + "epoch": 1.6516234124535452, + "grad_norm": 10.576798439025879, + "learning_rate": 2.247294312577425e-05, + "loss": 2.7644, + "step": 5313000 + }, + { + "epoch": 1.651778844734032, + "grad_norm": 8.828859329223633, + "learning_rate": 2.2470352587766133e-05, + "loss": 2.6752, + "step": 5313500 + }, + { + "epoch": 1.651934277014519, + "grad_norm": 7.965207576751709, + "learning_rate": 2.2467762049758017e-05, + "loss": 2.7535, + "step": 5314000 + }, + { + "epoch": 1.6520897092950058, + "grad_norm": 34.599578857421875, + "learning_rate": 2.2465171511749904e-05, + "loss": 2.7244, + "step": 5314500 + }, + { + "epoch": 1.6522451415754926, + "grad_norm": 11.322369575500488, + "learning_rate": 2.246258097374179e-05, + "loss": 2.7787, + "step": 5315000 + }, + { + "epoch": 1.6524005738559797, + "grad_norm": 14.81467056274414, + "learning_rate": 2.2459990435733675e-05, + "loss": 2.7598, + "step": 5315500 + }, + { + "epoch": 1.6525560061364666, + "grad_norm": 8.435686111450195, + "learning_rate": 2.2457399897725562e-05, + "loss": 2.7691, + "step": 5316000 + }, + { + "epoch": 1.6527114384169534, + "grad_norm": 16.71601676940918, + "learning_rate": 2.2454809359717446e-05, + "loss": 2.7316, + "step": 5316500 + }, + { + "epoch": 1.6528668706974403, + "grad_norm": 8.47723388671875, + "learning_rate": 2.245221882170933e-05, + "loss": 2.7275, + "step": 5317000 + }, + { + "epoch": 1.6530223029779272, + "grad_norm": 12.22095775604248, + "learning_rate": 2.2449628283701217e-05, + "loss": 2.7813, + "step": 5317500 + }, + { + "epoch": 1.653177735258414, + "grad_norm": 10.743239402770996, + "learning_rate": 2.2447037745693104e-05, + "loss": 2.7148, + "step": 5318000 + }, + { + "epoch": 1.653333167538901, + "grad_norm": 7.872711658477783, + "learning_rate": 2.2444447207684988e-05, + "loss": 2.7428, + "step": 5318500 + }, + { + "epoch": 1.6534885998193878, + "grad_norm": 11.15896987915039, + "learning_rate": 2.244185666967687e-05, + "loss": 2.769, + "step": 5319000 + }, + { + "epoch": 1.6536440320998747, + "grad_norm": 15.114326477050781, + "learning_rate": 2.243926613166876e-05, + "loss": 2.7571, + "step": 5319500 + }, + { + "epoch": 1.6537994643803615, + "grad_norm": 9.385165214538574, + "learning_rate": 2.2436675593660646e-05, + "loss": 2.7257, + "step": 5320000 + }, + { + "epoch": 1.6539548966608484, + "grad_norm": 9.775071144104004, + "learning_rate": 2.243408505565253e-05, + "loss": 2.7455, + "step": 5320500 + }, + { + "epoch": 1.6541103289413353, + "grad_norm": 9.421754837036133, + "learning_rate": 2.2431494517644413e-05, + "loss": 2.7356, + "step": 5321000 + }, + { + "epoch": 1.6542657612218221, + "grad_norm": 10.892203330993652, + "learning_rate": 2.24289039796363e-05, + "loss": 2.7474, + "step": 5321500 + }, + { + "epoch": 1.654421193502309, + "grad_norm": 12.174942970275879, + "learning_rate": 2.2426313441628184e-05, + "loss": 2.7514, + "step": 5322000 + }, + { + "epoch": 1.6545766257827959, + "grad_norm": 9.844952583312988, + "learning_rate": 2.242372290362007e-05, + "loss": 2.729, + "step": 5322500 + }, + { + "epoch": 1.6547320580632827, + "grad_norm": 8.239181518554688, + "learning_rate": 2.2421132365611955e-05, + "loss": 2.7489, + "step": 5323000 + }, + { + "epoch": 1.6548874903437696, + "grad_norm": 15.144679069519043, + "learning_rate": 2.2418541827603842e-05, + "loss": 2.761, + "step": 5323500 + }, + { + "epoch": 1.6550429226242565, + "grad_norm": 9.286270141601562, + "learning_rate": 2.2415951289595726e-05, + "loss": 2.7799, + "step": 5324000 + }, + { + "epoch": 1.6551983549047433, + "grad_norm": 33.33597183227539, + "learning_rate": 2.2413360751587613e-05, + "loss": 2.777, + "step": 5324500 + }, + { + "epoch": 1.6553537871852302, + "grad_norm": 9.697218894958496, + "learning_rate": 2.24107702135795e-05, + "loss": 2.799, + "step": 5325000 + }, + { + "epoch": 1.655509219465717, + "grad_norm": 9.211929321289062, + "learning_rate": 2.2408179675571384e-05, + "loss": 2.759, + "step": 5325500 + }, + { + "epoch": 1.655664651746204, + "grad_norm": 7.880693435668945, + "learning_rate": 2.2405589137563268e-05, + "loss": 2.7831, + "step": 5326000 + }, + { + "epoch": 1.6558200840266908, + "grad_norm": 10.039965629577637, + "learning_rate": 2.240299859955515e-05, + "loss": 2.7306, + "step": 5326500 + }, + { + "epoch": 1.6559755163071777, + "grad_norm": 18.55646514892578, + "learning_rate": 2.240040806154704e-05, + "loss": 2.7783, + "step": 5327000 + }, + { + "epoch": 1.6561309485876645, + "grad_norm": 10.546341896057129, + "learning_rate": 2.2397817523538926e-05, + "loss": 2.7157, + "step": 5327500 + }, + { + "epoch": 1.6562863808681514, + "grad_norm": 8.425714492797852, + "learning_rate": 2.239522698553081e-05, + "loss": 2.7419, + "step": 5328000 + }, + { + "epoch": 1.6564418131486383, + "grad_norm": 10.324603080749512, + "learning_rate": 2.2392636447522697e-05, + "loss": 2.7394, + "step": 5328500 + }, + { + "epoch": 1.6565972454291251, + "grad_norm": 8.933425903320312, + "learning_rate": 2.239004590951458e-05, + "loss": 2.7572, + "step": 5329000 + }, + { + "epoch": 1.656752677709612, + "grad_norm": 37.830955505371094, + "learning_rate": 2.2387455371506468e-05, + "loss": 2.8096, + "step": 5329500 + }, + { + "epoch": 1.6569081099900989, + "grad_norm": 17.29912757873535, + "learning_rate": 2.238486483349835e-05, + "loss": 2.7935, + "step": 5330000 + }, + { + "epoch": 1.6570635422705857, + "grad_norm": 10.159836769104004, + "learning_rate": 2.238227429549024e-05, + "loss": 2.6732, + "step": 5330500 + }, + { + "epoch": 1.6572189745510726, + "grad_norm": 8.95885181427002, + "learning_rate": 2.2379683757482122e-05, + "loss": 2.7388, + "step": 5331000 + }, + { + "epoch": 1.6573744068315595, + "grad_norm": 8.267448425292969, + "learning_rate": 2.2377093219474006e-05, + "loss": 2.7976, + "step": 5331500 + }, + { + "epoch": 1.6575298391120463, + "grad_norm": 10.93437671661377, + "learning_rate": 2.2374502681465893e-05, + "loss": 2.7562, + "step": 5332000 + }, + { + "epoch": 1.6576852713925332, + "grad_norm": 12.275898933410645, + "learning_rate": 2.237191214345778e-05, + "loss": 2.7426, + "step": 5332500 + }, + { + "epoch": 1.65784070367302, + "grad_norm": 32.65596008300781, + "learning_rate": 2.2369321605449664e-05, + "loss": 2.7609, + "step": 5333000 + }, + { + "epoch": 1.6579961359535071, + "grad_norm": 12.707484245300293, + "learning_rate": 2.2366731067441548e-05, + "loss": 2.7277, + "step": 5333500 + }, + { + "epoch": 1.658151568233994, + "grad_norm": 45.83620834350586, + "learning_rate": 2.2364140529433435e-05, + "loss": 2.7676, + "step": 5334000 + }, + { + "epoch": 1.6583070005144809, + "grad_norm": 8.373863220214844, + "learning_rate": 2.2361549991425322e-05, + "loss": 2.744, + "step": 5334500 + }, + { + "epoch": 1.6584624327949677, + "grad_norm": 26.30175018310547, + "learning_rate": 2.2358959453417206e-05, + "loss": 2.7565, + "step": 5335000 + }, + { + "epoch": 1.6586178650754546, + "grad_norm": 12.16161060333252, + "learning_rate": 2.235636891540909e-05, + "loss": 2.7633, + "step": 5335500 + }, + { + "epoch": 1.6587732973559415, + "grad_norm": 17.55962371826172, + "learning_rate": 2.2353778377400977e-05, + "loss": 2.745, + "step": 5336000 + }, + { + "epoch": 1.6589287296364283, + "grad_norm": 9.696568489074707, + "learning_rate": 2.235118783939286e-05, + "loss": 2.7608, + "step": 5336500 + }, + { + "epoch": 1.6590841619169152, + "grad_norm": 7.431654930114746, + "learning_rate": 2.2348597301384748e-05, + "loss": 2.7725, + "step": 5337000 + }, + { + "epoch": 1.659239594197402, + "grad_norm": 15.80117130279541, + "learning_rate": 2.2346006763376635e-05, + "loss": 2.7486, + "step": 5337500 + }, + { + "epoch": 1.659395026477889, + "grad_norm": 9.356834411621094, + "learning_rate": 2.234341622536852e-05, + "loss": 2.748, + "step": 5338000 + }, + { + "epoch": 1.6595504587583758, + "grad_norm": 17.110490798950195, + "learning_rate": 2.2340825687360403e-05, + "loss": 2.7317, + "step": 5338500 + }, + { + "epoch": 1.6597058910388627, + "grad_norm": 8.739042282104492, + "learning_rate": 2.2338235149352286e-05, + "loss": 2.7488, + "step": 5339000 + }, + { + "epoch": 1.6598613233193498, + "grad_norm": 26.86712074279785, + "learning_rate": 2.2335644611344177e-05, + "loss": 2.7761, + "step": 5339500 + }, + { + "epoch": 1.6600167555998366, + "grad_norm": 8.610396385192871, + "learning_rate": 2.233305407333606e-05, + "loss": 2.7595, + "step": 5340000 + }, + { + "epoch": 1.6601721878803235, + "grad_norm": 8.694355964660645, + "learning_rate": 2.2330463535327944e-05, + "loss": 2.7738, + "step": 5340500 + }, + { + "epoch": 1.6603276201608104, + "grad_norm": 10.888955116271973, + "learning_rate": 2.2327872997319828e-05, + "loss": 2.7864, + "step": 5341000 + }, + { + "epoch": 1.6604830524412972, + "grad_norm": 10.975061416625977, + "learning_rate": 2.2325282459311715e-05, + "loss": 2.755, + "step": 5341500 + }, + { + "epoch": 1.660638484721784, + "grad_norm": 9.95356273651123, + "learning_rate": 2.2322691921303602e-05, + "loss": 2.8125, + "step": 5342000 + }, + { + "epoch": 1.660793917002271, + "grad_norm": 11.320221900939941, + "learning_rate": 2.2320101383295486e-05, + "loss": 2.8227, + "step": 5342500 + }, + { + "epoch": 1.6609493492827578, + "grad_norm": 7.601110935211182, + "learning_rate": 2.2317510845287373e-05, + "loss": 2.7165, + "step": 5343000 + }, + { + "epoch": 1.6611047815632447, + "grad_norm": 9.282325744628906, + "learning_rate": 2.2314920307279257e-05, + "loss": 2.8187, + "step": 5343500 + }, + { + "epoch": 1.6612602138437316, + "grad_norm": 8.570667266845703, + "learning_rate": 2.231232976927114e-05, + "loss": 2.7486, + "step": 5344000 + }, + { + "epoch": 1.6614156461242184, + "grad_norm": 10.829822540283203, + "learning_rate": 2.2309739231263028e-05, + "loss": 2.7899, + "step": 5344500 + }, + { + "epoch": 1.6615710784047053, + "grad_norm": 9.086175918579102, + "learning_rate": 2.2307148693254915e-05, + "loss": 2.7847, + "step": 5345000 + }, + { + "epoch": 1.6617265106851922, + "grad_norm": 10.140761375427246, + "learning_rate": 2.23045581552468e-05, + "loss": 2.7726, + "step": 5345500 + }, + { + "epoch": 1.661881942965679, + "grad_norm": 7.559652328491211, + "learning_rate": 2.2301967617238683e-05, + "loss": 2.8098, + "step": 5346000 + }, + { + "epoch": 1.662037375246166, + "grad_norm": 8.867650985717773, + "learning_rate": 2.229937707923057e-05, + "loss": 2.778, + "step": 5346500 + }, + { + "epoch": 1.6621928075266528, + "grad_norm": 9.60360336303711, + "learning_rate": 2.2296786541222457e-05, + "loss": 2.7753, + "step": 5347000 + }, + { + "epoch": 1.6623482398071396, + "grad_norm": 10.944526672363281, + "learning_rate": 2.229419600321434e-05, + "loss": 2.7268, + "step": 5347500 + }, + { + "epoch": 1.6625036720876265, + "grad_norm": 7.243895530700684, + "learning_rate": 2.2291605465206225e-05, + "loss": 2.7333, + "step": 5348000 + }, + { + "epoch": 1.6626591043681134, + "grad_norm": 17.54973030090332, + "learning_rate": 2.2289014927198112e-05, + "loss": 2.7084, + "step": 5348500 + }, + { + "epoch": 1.6628145366486002, + "grad_norm": 10.230396270751953, + "learning_rate": 2.2286424389189995e-05, + "loss": 2.7377, + "step": 5349000 + }, + { + "epoch": 1.662969968929087, + "grad_norm": 10.763652801513672, + "learning_rate": 2.2283833851181883e-05, + "loss": 2.7643, + "step": 5349500 + }, + { + "epoch": 1.663125401209574, + "grad_norm": 9.572582244873047, + "learning_rate": 2.2281243313173766e-05, + "loss": 2.7478, + "step": 5350000 + }, + { + "epoch": 1.6632808334900608, + "grad_norm": 8.727718353271484, + "learning_rate": 2.2278652775165654e-05, + "loss": 2.7335, + "step": 5350500 + }, + { + "epoch": 1.6634362657705477, + "grad_norm": 10.95338249206543, + "learning_rate": 2.2276062237157537e-05, + "loss": 2.7651, + "step": 5351000 + }, + { + "epoch": 1.6635916980510346, + "grad_norm": 6.420540809631348, + "learning_rate": 2.2273471699149424e-05, + "loss": 2.7528, + "step": 5351500 + }, + { + "epoch": 1.6637471303315214, + "grad_norm": 25.264848709106445, + "learning_rate": 2.227088116114131e-05, + "loss": 2.8047, + "step": 5352000 + }, + { + "epoch": 1.6639025626120083, + "grad_norm": 10.712063789367676, + "learning_rate": 2.2268290623133195e-05, + "loss": 2.7821, + "step": 5352500 + }, + { + "epoch": 1.6640579948924952, + "grad_norm": 9.631033897399902, + "learning_rate": 2.226570008512508e-05, + "loss": 2.781, + "step": 5353000 + }, + { + "epoch": 1.664213427172982, + "grad_norm": 9.105058670043945, + "learning_rate": 2.2263109547116963e-05, + "loss": 2.7281, + "step": 5353500 + }, + { + "epoch": 1.664368859453469, + "grad_norm": 9.552000999450684, + "learning_rate": 2.226051900910885e-05, + "loss": 2.7642, + "step": 5354000 + }, + { + "epoch": 1.6645242917339558, + "grad_norm": 9.008387565612793, + "learning_rate": 2.2257928471100737e-05, + "loss": 2.7645, + "step": 5354500 + }, + { + "epoch": 1.6646797240144426, + "grad_norm": 10.500658988952637, + "learning_rate": 2.225533793309262e-05, + "loss": 2.8016, + "step": 5355000 + }, + { + "epoch": 1.6648351562949295, + "grad_norm": 9.82056999206543, + "learning_rate": 2.2252747395084508e-05, + "loss": 2.7322, + "step": 5355500 + }, + { + "epoch": 1.6649905885754164, + "grad_norm": 10.2846040725708, + "learning_rate": 2.2250156857076392e-05, + "loss": 2.7645, + "step": 5356000 + }, + { + "epoch": 1.6651460208559032, + "grad_norm": 9.968180656433105, + "learning_rate": 2.224756631906828e-05, + "loss": 2.7778, + "step": 5356500 + }, + { + "epoch": 1.66530145313639, + "grad_norm": 11.734237670898438, + "learning_rate": 2.2244975781060163e-05, + "loss": 2.689, + "step": 5357000 + }, + { + "epoch": 1.6654568854168772, + "grad_norm": 6.328516006469727, + "learning_rate": 2.224238524305205e-05, + "loss": 2.739, + "step": 5357500 + }, + { + "epoch": 1.665612317697364, + "grad_norm": 10.147895812988281, + "learning_rate": 2.2239794705043934e-05, + "loss": 2.695, + "step": 5358000 + }, + { + "epoch": 1.665767749977851, + "grad_norm": 8.815143585205078, + "learning_rate": 2.2237204167035817e-05, + "loss": 2.6884, + "step": 5358500 + }, + { + "epoch": 1.6659231822583378, + "grad_norm": 10.822271347045898, + "learning_rate": 2.2234613629027705e-05, + "loss": 2.7643, + "step": 5359000 + }, + { + "epoch": 1.6660786145388247, + "grad_norm": 8.918590545654297, + "learning_rate": 2.2232023091019592e-05, + "loss": 2.7437, + "step": 5359500 + }, + { + "epoch": 1.6662340468193115, + "grad_norm": 10.684674263000488, + "learning_rate": 2.2229432553011476e-05, + "loss": 2.7575, + "step": 5360000 + }, + { + "epoch": 1.6663894790997984, + "grad_norm": 8.2843599319458, + "learning_rate": 2.222684201500336e-05, + "loss": 2.7498, + "step": 5360500 + }, + { + "epoch": 1.6665449113802853, + "grad_norm": 7.06508207321167, + "learning_rate": 2.2224251476995246e-05, + "loss": 2.7407, + "step": 5361000 + }, + { + "epoch": 1.6667003436607721, + "grad_norm": 10.889261245727539, + "learning_rate": 2.2221660938987134e-05, + "loss": 2.7352, + "step": 5361500 + }, + { + "epoch": 1.666855775941259, + "grad_norm": 9.206955909729004, + "learning_rate": 2.2219070400979017e-05, + "loss": 2.7737, + "step": 5362000 + }, + { + "epoch": 1.6670112082217459, + "grad_norm": 10.094037055969238, + "learning_rate": 2.22164798629709e-05, + "loss": 2.7211, + "step": 5362500 + }, + { + "epoch": 1.6671666405022327, + "grad_norm": 14.043983459472656, + "learning_rate": 2.221388932496279e-05, + "loss": 2.7617, + "step": 5363000 + }, + { + "epoch": 1.6673220727827198, + "grad_norm": 17.0690975189209, + "learning_rate": 2.2211298786954672e-05, + "loss": 2.7105, + "step": 5363500 + }, + { + "epoch": 1.6674775050632067, + "grad_norm": 10.993936538696289, + "learning_rate": 2.220870824894656e-05, + "loss": 2.7804, + "step": 5364000 + }, + { + "epoch": 1.6676329373436936, + "grad_norm": 11.355155944824219, + "learning_rate": 2.2206117710938446e-05, + "loss": 2.7677, + "step": 5364500 + }, + { + "epoch": 1.6677883696241804, + "grad_norm": 8.162689208984375, + "learning_rate": 2.220352717293033e-05, + "loss": 2.7861, + "step": 5365000 + }, + { + "epoch": 1.6679438019046673, + "grad_norm": 12.044848442077637, + "learning_rate": 2.2200936634922214e-05, + "loss": 2.728, + "step": 5365500 + }, + { + "epoch": 1.6680992341851542, + "grad_norm": 7.988921642303467, + "learning_rate": 2.21983460969141e-05, + "loss": 2.7894, + "step": 5366000 + }, + { + "epoch": 1.668254666465641, + "grad_norm": 9.834904670715332, + "learning_rate": 2.2195755558905988e-05, + "loss": 2.7409, + "step": 5366500 + }, + { + "epoch": 1.668410098746128, + "grad_norm": 8.196542739868164, + "learning_rate": 2.2193165020897872e-05, + "loss": 2.6924, + "step": 5367000 + }, + { + "epoch": 1.6685655310266148, + "grad_norm": 12.12973690032959, + "learning_rate": 2.2190574482889756e-05, + "loss": 2.7699, + "step": 5367500 + }, + { + "epoch": 1.6687209633071016, + "grad_norm": 10.010697364807129, + "learning_rate": 2.218798394488164e-05, + "loss": 2.7411, + "step": 5368000 + }, + { + "epoch": 1.6688763955875885, + "grad_norm": 9.870973587036133, + "learning_rate": 2.2185393406873527e-05, + "loss": 2.7076, + "step": 5368500 + }, + { + "epoch": 1.6690318278680754, + "grad_norm": 10.997772216796875, + "learning_rate": 2.2182802868865414e-05, + "loss": 2.7701, + "step": 5369000 + }, + { + "epoch": 1.6691872601485622, + "grad_norm": 19.2208194732666, + "learning_rate": 2.2180212330857298e-05, + "loss": 2.7369, + "step": 5369500 + }, + { + "epoch": 1.669342692429049, + "grad_norm": 12.077685356140137, + "learning_rate": 2.2177621792849185e-05, + "loss": 2.7116, + "step": 5370000 + }, + { + "epoch": 1.669498124709536, + "grad_norm": 8.840202331542969, + "learning_rate": 2.217503125484107e-05, + "loss": 2.758, + "step": 5370500 + }, + { + "epoch": 1.6696535569900228, + "grad_norm": 7.749340534210205, + "learning_rate": 2.2172440716832956e-05, + "loss": 2.7333, + "step": 5371000 + }, + { + "epoch": 1.6698089892705097, + "grad_norm": 6.940791130065918, + "learning_rate": 2.216985017882484e-05, + "loss": 2.7048, + "step": 5371500 + }, + { + "epoch": 1.6699644215509966, + "grad_norm": 8.993863105773926, + "learning_rate": 2.2167259640816727e-05, + "loss": 2.7862, + "step": 5372000 + }, + { + "epoch": 1.6701198538314834, + "grad_norm": 7.797881603240967, + "learning_rate": 2.216466910280861e-05, + "loss": 2.7865, + "step": 5372500 + }, + { + "epoch": 1.6702752861119703, + "grad_norm": 9.283221244812012, + "learning_rate": 2.2162078564800494e-05, + "loss": 2.7922, + "step": 5373000 + }, + { + "epoch": 1.6704307183924572, + "grad_norm": 7.647520542144775, + "learning_rate": 2.215948802679238e-05, + "loss": 2.7419, + "step": 5373500 + }, + { + "epoch": 1.670586150672944, + "grad_norm": 12.135144233703613, + "learning_rate": 2.215689748878427e-05, + "loss": 2.7488, + "step": 5374000 + }, + { + "epoch": 1.670741582953431, + "grad_norm": 10.976385116577148, + "learning_rate": 2.2154306950776152e-05, + "loss": 2.8244, + "step": 5374500 + }, + { + "epoch": 1.6708970152339178, + "grad_norm": 8.761839866638184, + "learning_rate": 2.2151716412768036e-05, + "loss": 2.7634, + "step": 5375000 + }, + { + "epoch": 1.6710524475144046, + "grad_norm": 15.048534393310547, + "learning_rate": 2.2149125874759923e-05, + "loss": 2.759, + "step": 5375500 + }, + { + "epoch": 1.6712078797948915, + "grad_norm": 11.99161148071289, + "learning_rate": 2.214653533675181e-05, + "loss": 2.7321, + "step": 5376000 + }, + { + "epoch": 1.6713633120753784, + "grad_norm": 12.282313346862793, + "learning_rate": 2.2143944798743694e-05, + "loss": 2.7484, + "step": 5376500 + }, + { + "epoch": 1.6715187443558652, + "grad_norm": 7.40659761428833, + "learning_rate": 2.2141354260735578e-05, + "loss": 2.7627, + "step": 5377000 + }, + { + "epoch": 1.671674176636352, + "grad_norm": 10.235517501831055, + "learning_rate": 2.2138763722727465e-05, + "loss": 2.7301, + "step": 5377500 + }, + { + "epoch": 1.671829608916839, + "grad_norm": 9.783831596374512, + "learning_rate": 2.213617318471935e-05, + "loss": 2.7791, + "step": 5378000 + }, + { + "epoch": 1.6719850411973258, + "grad_norm": 10.315239906311035, + "learning_rate": 2.2133582646711236e-05, + "loss": 2.7067, + "step": 5378500 + }, + { + "epoch": 1.6721404734778127, + "grad_norm": 7.344583988189697, + "learning_rate": 2.2130992108703123e-05, + "loss": 2.7899, + "step": 5379000 + }, + { + "epoch": 1.6722959057582996, + "grad_norm": 14.129172325134277, + "learning_rate": 2.2128401570695007e-05, + "loss": 2.7584, + "step": 5379500 + }, + { + "epoch": 1.6724513380387864, + "grad_norm": 10.945125579833984, + "learning_rate": 2.212581103268689e-05, + "loss": 2.7551, + "step": 5380000 + }, + { + "epoch": 1.6726067703192733, + "grad_norm": 8.136139869689941, + "learning_rate": 2.2123220494678774e-05, + "loss": 2.7861, + "step": 5380500 + }, + { + "epoch": 1.6727622025997602, + "grad_norm": 9.44048023223877, + "learning_rate": 2.2120629956670665e-05, + "loss": 2.7568, + "step": 5381000 + }, + { + "epoch": 1.6729176348802473, + "grad_norm": 11.563302993774414, + "learning_rate": 2.211803941866255e-05, + "loss": 2.7425, + "step": 5381500 + }, + { + "epoch": 1.6730730671607341, + "grad_norm": 9.090697288513184, + "learning_rate": 2.2115448880654432e-05, + "loss": 2.6676, + "step": 5382000 + }, + { + "epoch": 1.673228499441221, + "grad_norm": 18.878671646118164, + "learning_rate": 2.211285834264632e-05, + "loss": 2.7562, + "step": 5382500 + }, + { + "epoch": 1.6733839317217079, + "grad_norm": 8.897077560424805, + "learning_rate": 2.2110267804638203e-05, + "loss": 2.767, + "step": 5383000 + }, + { + "epoch": 1.6735393640021947, + "grad_norm": 8.538474082946777, + "learning_rate": 2.210767726663009e-05, + "loss": 2.7597, + "step": 5383500 + }, + { + "epoch": 1.6736947962826816, + "grad_norm": 9.443570137023926, + "learning_rate": 2.2105086728621974e-05, + "loss": 2.7719, + "step": 5384000 + }, + { + "epoch": 1.6738502285631685, + "grad_norm": 18.182254791259766, + "learning_rate": 2.210249619061386e-05, + "loss": 2.7749, + "step": 5384500 + }, + { + "epoch": 1.6740056608436553, + "grad_norm": 8.071422576904297, + "learning_rate": 2.2099905652605745e-05, + "loss": 2.7176, + "step": 5385000 + }, + { + "epoch": 1.6741610931241422, + "grad_norm": 10.96365737915039, + "learning_rate": 2.209731511459763e-05, + "loss": 2.7501, + "step": 5385500 + }, + { + "epoch": 1.674316525404629, + "grad_norm": 9.74303150177002, + "learning_rate": 2.2094724576589516e-05, + "loss": 2.763, + "step": 5386000 + }, + { + "epoch": 1.674471957685116, + "grad_norm": 13.078845977783203, + "learning_rate": 2.2092134038581403e-05, + "loss": 2.7455, + "step": 5386500 + }, + { + "epoch": 1.6746273899656028, + "grad_norm": 13.724837303161621, + "learning_rate": 2.2089543500573287e-05, + "loss": 2.761, + "step": 5387000 + }, + { + "epoch": 1.6747828222460899, + "grad_norm": 11.946564674377441, + "learning_rate": 2.208695296256517e-05, + "loss": 2.7775, + "step": 5387500 + }, + { + "epoch": 1.6749382545265767, + "grad_norm": 8.506447792053223, + "learning_rate": 2.2084362424557058e-05, + "loss": 2.7585, + "step": 5388000 + }, + { + "epoch": 1.6750936868070636, + "grad_norm": 11.785419464111328, + "learning_rate": 2.2081771886548945e-05, + "loss": 2.722, + "step": 5388500 + }, + { + "epoch": 1.6752491190875505, + "grad_norm": 10.54719352722168, + "learning_rate": 2.207918134854083e-05, + "loss": 2.759, + "step": 5389000 + }, + { + "epoch": 1.6754045513680373, + "grad_norm": 12.03770637512207, + "learning_rate": 2.2076590810532713e-05, + "loss": 2.7095, + "step": 5389500 + }, + { + "epoch": 1.6755599836485242, + "grad_norm": 8.30015754699707, + "learning_rate": 2.20740002725246e-05, + "loss": 2.6684, + "step": 5390000 + }, + { + "epoch": 1.675715415929011, + "grad_norm": 8.845145225524902, + "learning_rate": 2.2071409734516484e-05, + "loss": 2.7098, + "step": 5390500 + }, + { + "epoch": 1.675870848209498, + "grad_norm": 8.845921516418457, + "learning_rate": 2.206881919650837e-05, + "loss": 2.7883, + "step": 5391000 + }, + { + "epoch": 1.6760262804899848, + "grad_norm": 8.296984672546387, + "learning_rate": 2.2066228658500258e-05, + "loss": 2.7653, + "step": 5391500 + }, + { + "epoch": 1.6761817127704717, + "grad_norm": 12.772007942199707, + "learning_rate": 2.206363812049214e-05, + "loss": 2.7173, + "step": 5392000 + }, + { + "epoch": 1.6763371450509585, + "grad_norm": 10.083342552185059, + "learning_rate": 2.2061047582484025e-05, + "loss": 2.803, + "step": 5392500 + }, + { + "epoch": 1.6764925773314454, + "grad_norm": 13.786587715148926, + "learning_rate": 2.2058457044475912e-05, + "loss": 2.755, + "step": 5393000 + }, + { + "epoch": 1.6766480096119323, + "grad_norm": 9.267627716064453, + "learning_rate": 2.20558665064678e-05, + "loss": 2.7545, + "step": 5393500 + }, + { + "epoch": 1.6768034418924191, + "grad_norm": 10.002193450927734, + "learning_rate": 2.2053275968459683e-05, + "loss": 2.7744, + "step": 5394000 + }, + { + "epoch": 1.676958874172906, + "grad_norm": 9.547591209411621, + "learning_rate": 2.2050685430451567e-05, + "loss": 2.7175, + "step": 5394500 + }, + { + "epoch": 1.6771143064533929, + "grad_norm": 8.459555625915527, + "learning_rate": 2.204809489244345e-05, + "loss": 2.7882, + "step": 5395000 + }, + { + "epoch": 1.6772697387338797, + "grad_norm": 13.046350479125977, + "learning_rate": 2.2045504354435338e-05, + "loss": 2.7387, + "step": 5395500 + }, + { + "epoch": 1.6774251710143666, + "grad_norm": 17.497934341430664, + "learning_rate": 2.2042913816427225e-05, + "loss": 2.7418, + "step": 5396000 + }, + { + "epoch": 1.6775806032948535, + "grad_norm": 18.16010093688965, + "learning_rate": 2.204032327841911e-05, + "loss": 2.7548, + "step": 5396500 + }, + { + "epoch": 1.6777360355753403, + "grad_norm": 8.801715850830078, + "learning_rate": 2.2037732740410996e-05, + "loss": 2.7436, + "step": 5397000 + }, + { + "epoch": 1.6778914678558272, + "grad_norm": 9.803305625915527, + "learning_rate": 2.203514220240288e-05, + "loss": 2.738, + "step": 5397500 + }, + { + "epoch": 1.678046900136314, + "grad_norm": 27.312580108642578, + "learning_rate": 2.2032551664394767e-05, + "loss": 2.7399, + "step": 5398000 + }, + { + "epoch": 1.678202332416801, + "grad_norm": 12.007728576660156, + "learning_rate": 2.202996112638665e-05, + "loss": 2.7656, + "step": 5398500 + }, + { + "epoch": 1.6783577646972878, + "grad_norm": 9.471892356872559, + "learning_rate": 2.2027370588378538e-05, + "loss": 2.7209, + "step": 5399000 + }, + { + "epoch": 1.6785131969777747, + "grad_norm": 12.617435455322266, + "learning_rate": 2.2024780050370422e-05, + "loss": 2.7402, + "step": 5399500 + }, + { + "epoch": 1.6786686292582615, + "grad_norm": 9.049778938293457, + "learning_rate": 2.2022189512362306e-05, + "loss": 2.7561, + "step": 5400000 + }, + { + "epoch": 1.6788240615387484, + "grad_norm": 9.79090690612793, + "learning_rate": 2.2019598974354193e-05, + "loss": 2.7589, + "step": 5400500 + }, + { + "epoch": 1.6789794938192353, + "grad_norm": 10.521366119384766, + "learning_rate": 2.201700843634608e-05, + "loss": 2.7456, + "step": 5401000 + }, + { + "epoch": 1.6791349260997221, + "grad_norm": 9.094416618347168, + "learning_rate": 2.2014417898337964e-05, + "loss": 2.6834, + "step": 5401500 + }, + { + "epoch": 1.679290358380209, + "grad_norm": 10.7474365234375, + "learning_rate": 2.2011827360329847e-05, + "loss": 2.7297, + "step": 5402000 + }, + { + "epoch": 1.6794457906606959, + "grad_norm": 10.388806343078613, + "learning_rate": 2.2009236822321735e-05, + "loss": 2.7659, + "step": 5402500 + }, + { + "epoch": 1.6796012229411827, + "grad_norm": 11.230626106262207, + "learning_rate": 2.200664628431362e-05, + "loss": 2.7091, + "step": 5403000 + }, + { + "epoch": 1.6797566552216696, + "grad_norm": 8.767033576965332, + "learning_rate": 2.2004055746305505e-05, + "loss": 2.7008, + "step": 5403500 + }, + { + "epoch": 1.6799120875021565, + "grad_norm": 7.86704683303833, + "learning_rate": 2.200146520829739e-05, + "loss": 2.7832, + "step": 5404000 + }, + { + "epoch": 1.6800675197826433, + "grad_norm": 15.532984733581543, + "learning_rate": 2.1998874670289276e-05, + "loss": 2.7842, + "step": 5404500 + }, + { + "epoch": 1.6802229520631302, + "grad_norm": 10.386643409729004, + "learning_rate": 2.199628413228116e-05, + "loss": 2.7897, + "step": 5405000 + }, + { + "epoch": 1.6803783843436173, + "grad_norm": 16.483976364135742, + "learning_rate": 2.1993693594273047e-05, + "loss": 2.7786, + "step": 5405500 + }, + { + "epoch": 1.6805338166241042, + "grad_norm": 8.956363677978516, + "learning_rate": 2.1991103056264934e-05, + "loss": 2.7626, + "step": 5406000 + }, + { + "epoch": 1.680689248904591, + "grad_norm": 9.2637939453125, + "learning_rate": 2.1988512518256818e-05, + "loss": 2.7656, + "step": 5406500 + }, + { + "epoch": 1.680844681185078, + "grad_norm": 8.893937110900879, + "learning_rate": 2.1985921980248702e-05, + "loss": 2.7582, + "step": 5407000 + }, + { + "epoch": 1.6810001134655648, + "grad_norm": 8.653496742248535, + "learning_rate": 2.1983331442240586e-05, + "loss": 2.8045, + "step": 5407500 + }, + { + "epoch": 1.6811555457460516, + "grad_norm": 9.668142318725586, + "learning_rate": 2.1980740904232476e-05, + "loss": 2.7598, + "step": 5408000 + }, + { + "epoch": 1.6813109780265385, + "grad_norm": 49.53631591796875, + "learning_rate": 2.197815036622436e-05, + "loss": 2.7196, + "step": 5408500 + }, + { + "epoch": 1.6814664103070254, + "grad_norm": 40.90023422241211, + "learning_rate": 2.1975559828216244e-05, + "loss": 2.7143, + "step": 5409000 + }, + { + "epoch": 1.6816218425875122, + "grad_norm": 9.059181213378906, + "learning_rate": 2.197296929020813e-05, + "loss": 2.7092, + "step": 5409500 + }, + { + "epoch": 1.681777274867999, + "grad_norm": 9.221723556518555, + "learning_rate": 2.1970378752200015e-05, + "loss": 2.765, + "step": 5410000 + }, + { + "epoch": 1.681932707148486, + "grad_norm": 8.490478515625, + "learning_rate": 2.1967788214191902e-05, + "loss": 2.7163, + "step": 5410500 + }, + { + "epoch": 1.6820881394289728, + "grad_norm": 10.160165786743164, + "learning_rate": 2.1965197676183786e-05, + "loss": 2.7235, + "step": 5411000 + }, + { + "epoch": 1.68224357170946, + "grad_norm": 20.047597885131836, + "learning_rate": 2.1962607138175673e-05, + "loss": 2.8094, + "step": 5411500 + }, + { + "epoch": 1.6823990039899468, + "grad_norm": 9.37185287475586, + "learning_rate": 2.1960016600167557e-05, + "loss": 2.6951, + "step": 5412000 + }, + { + "epoch": 1.6825544362704337, + "grad_norm": 7.8676018714904785, + "learning_rate": 2.195742606215944e-05, + "loss": 2.7817, + "step": 5412500 + }, + { + "epoch": 1.6827098685509205, + "grad_norm": 12.600003242492676, + "learning_rate": 2.1954835524151327e-05, + "loss": 2.7403, + "step": 5413000 + }, + { + "epoch": 1.6828653008314074, + "grad_norm": 12.365649223327637, + "learning_rate": 2.1952244986143215e-05, + "loss": 2.7266, + "step": 5413500 + }, + { + "epoch": 1.6830207331118943, + "grad_norm": 7.521510124206543, + "learning_rate": 2.19496544481351e-05, + "loss": 2.7496, + "step": 5414000 + }, + { + "epoch": 1.6831761653923811, + "grad_norm": 9.02755069732666, + "learning_rate": 2.1947063910126982e-05, + "loss": 2.7435, + "step": 5414500 + }, + { + "epoch": 1.683331597672868, + "grad_norm": 9.333996772766113, + "learning_rate": 2.194447337211887e-05, + "loss": 2.7716, + "step": 5415000 + }, + { + "epoch": 1.6834870299533549, + "grad_norm": 17.03403091430664, + "learning_rate": 2.1941882834110756e-05, + "loss": 2.7455, + "step": 5415500 + }, + { + "epoch": 1.6836424622338417, + "grad_norm": 13.622600555419922, + "learning_rate": 2.193929229610264e-05, + "loss": 2.7478, + "step": 5416000 + }, + { + "epoch": 1.6837978945143286, + "grad_norm": 11.325119018554688, + "learning_rate": 2.1936701758094524e-05, + "loss": 2.797, + "step": 5416500 + }, + { + "epoch": 1.6839533267948155, + "grad_norm": 15.245068550109863, + "learning_rate": 2.193411122008641e-05, + "loss": 2.6926, + "step": 5417000 + }, + { + "epoch": 1.6841087590753023, + "grad_norm": 21.018396377563477, + "learning_rate": 2.1931520682078295e-05, + "loss": 2.7412, + "step": 5417500 + }, + { + "epoch": 1.6842641913557892, + "grad_norm": 17.27669906616211, + "learning_rate": 2.1928930144070182e-05, + "loss": 2.7293, + "step": 5418000 + }, + { + "epoch": 1.684419623636276, + "grad_norm": 12.377429008483887, + "learning_rate": 2.192633960606207e-05, + "loss": 2.7375, + "step": 5418500 + }, + { + "epoch": 1.684575055916763, + "grad_norm": 8.443852424621582, + "learning_rate": 2.1923749068053953e-05, + "loss": 2.6853, + "step": 5419000 + }, + { + "epoch": 1.6847304881972498, + "grad_norm": 7.9462199211120605, + "learning_rate": 2.1921158530045837e-05, + "loss": 2.6969, + "step": 5419500 + }, + { + "epoch": 1.6848859204777367, + "grad_norm": 10.58413314819336, + "learning_rate": 2.1918567992037724e-05, + "loss": 2.7347, + "step": 5420000 + }, + { + "epoch": 1.6850413527582235, + "grad_norm": 12.269634246826172, + "learning_rate": 2.191597745402961e-05, + "loss": 2.8035, + "step": 5420500 + }, + { + "epoch": 1.6851967850387104, + "grad_norm": 10.932692527770996, + "learning_rate": 2.1913386916021495e-05, + "loss": 2.7251, + "step": 5421000 + }, + { + "epoch": 1.6853522173191973, + "grad_norm": 11.5308837890625, + "learning_rate": 2.191079637801338e-05, + "loss": 2.7112, + "step": 5421500 + }, + { + "epoch": 1.6855076495996841, + "grad_norm": 8.654688835144043, + "learning_rate": 2.1908205840005262e-05, + "loss": 2.7377, + "step": 5422000 + }, + { + "epoch": 1.685663081880171, + "grad_norm": 8.400565147399902, + "learning_rate": 2.190561530199715e-05, + "loss": 2.7753, + "step": 5422500 + }, + { + "epoch": 1.6858185141606579, + "grad_norm": 8.267916679382324, + "learning_rate": 2.1903024763989037e-05, + "loss": 2.745, + "step": 5423000 + }, + { + "epoch": 1.6859739464411447, + "grad_norm": 9.329157829284668, + "learning_rate": 2.190043422598092e-05, + "loss": 2.7583, + "step": 5423500 + }, + { + "epoch": 1.6861293787216316, + "grad_norm": 10.376145362854004, + "learning_rate": 2.1897843687972808e-05, + "loss": 2.776, + "step": 5424000 + }, + { + "epoch": 1.6862848110021185, + "grad_norm": 12.687171936035156, + "learning_rate": 2.189525314996469e-05, + "loss": 2.7794, + "step": 5424500 + }, + { + "epoch": 1.6864402432826053, + "grad_norm": 9.61076545715332, + "learning_rate": 2.189266261195658e-05, + "loss": 2.7399, + "step": 5425000 + }, + { + "epoch": 1.6865956755630922, + "grad_norm": 30.875991821289062, + "learning_rate": 2.1890072073948462e-05, + "loss": 2.7709, + "step": 5425500 + }, + { + "epoch": 1.686751107843579, + "grad_norm": 8.549886703491211, + "learning_rate": 2.188748153594035e-05, + "loss": 2.8001, + "step": 5426000 + }, + { + "epoch": 1.686906540124066, + "grad_norm": 9.397664070129395, + "learning_rate": 2.1884890997932233e-05, + "loss": 2.7754, + "step": 5426500 + }, + { + "epoch": 1.6870619724045528, + "grad_norm": 11.578761100769043, + "learning_rate": 2.1882300459924117e-05, + "loss": 2.717, + "step": 5427000 + }, + { + "epoch": 1.6872174046850397, + "grad_norm": 9.955902099609375, + "learning_rate": 2.1879709921916004e-05, + "loss": 2.682, + "step": 5427500 + }, + { + "epoch": 1.6873728369655265, + "grad_norm": 11.276185035705566, + "learning_rate": 2.187711938390789e-05, + "loss": 2.7894, + "step": 5428000 + }, + { + "epoch": 1.6875282692460134, + "grad_norm": 16.5047607421875, + "learning_rate": 2.1874528845899775e-05, + "loss": 2.704, + "step": 5428500 + }, + { + "epoch": 1.6876837015265003, + "grad_norm": 10.093840599060059, + "learning_rate": 2.187193830789166e-05, + "loss": 2.759, + "step": 5429000 + }, + { + "epoch": 1.6878391338069874, + "grad_norm": 12.690502166748047, + "learning_rate": 2.1869347769883546e-05, + "loss": 2.7872, + "step": 5429500 + }, + { + "epoch": 1.6879945660874742, + "grad_norm": 9.085474014282227, + "learning_rate": 2.1866757231875433e-05, + "loss": 2.7471, + "step": 5430000 + }, + { + "epoch": 1.688149998367961, + "grad_norm": 9.805253982543945, + "learning_rate": 2.1864166693867317e-05, + "loss": 2.7764, + "step": 5430500 + }, + { + "epoch": 1.688305430648448, + "grad_norm": 10.99216365814209, + "learning_rate": 2.18615761558592e-05, + "loss": 2.7152, + "step": 5431000 + }, + { + "epoch": 1.6884608629289348, + "grad_norm": 13.270184516906738, + "learning_rate": 2.1858985617851088e-05, + "loss": 2.7749, + "step": 5431500 + }, + { + "epoch": 1.6886162952094217, + "grad_norm": 12.693450927734375, + "learning_rate": 2.185639507984297e-05, + "loss": 2.742, + "step": 5432000 + }, + { + "epoch": 1.6887717274899086, + "grad_norm": 9.878926277160645, + "learning_rate": 2.185380454183486e-05, + "loss": 2.7257, + "step": 5432500 + }, + { + "epoch": 1.6889271597703954, + "grad_norm": 8.832232475280762, + "learning_rate": 2.1851214003826746e-05, + "loss": 2.7995, + "step": 5433000 + }, + { + "epoch": 1.6890825920508823, + "grad_norm": 9.922246932983398, + "learning_rate": 2.184862346581863e-05, + "loss": 2.7422, + "step": 5433500 + }, + { + "epoch": 1.6892380243313692, + "grad_norm": 10.810003280639648, + "learning_rate": 2.1846032927810513e-05, + "loss": 2.7628, + "step": 5434000 + }, + { + "epoch": 1.689393456611856, + "grad_norm": 9.36623764038086, + "learning_rate": 2.1843442389802397e-05, + "loss": 2.7363, + "step": 5434500 + }, + { + "epoch": 1.689548888892343, + "grad_norm": 9.247014045715332, + "learning_rate": 2.1840851851794288e-05, + "loss": 2.7688, + "step": 5435000 + }, + { + "epoch": 1.68970432117283, + "grad_norm": 10.097185134887695, + "learning_rate": 2.183826131378617e-05, + "loss": 2.6842, + "step": 5435500 + }, + { + "epoch": 1.6898597534533168, + "grad_norm": 7.624220848083496, + "learning_rate": 2.1835670775778055e-05, + "loss": 2.7828, + "step": 5436000 + }, + { + "epoch": 1.6900151857338037, + "grad_norm": 42.359798431396484, + "learning_rate": 2.1833080237769942e-05, + "loss": 2.7936, + "step": 5436500 + }, + { + "epoch": 1.6901706180142906, + "grad_norm": 8.21789264678955, + "learning_rate": 2.1830489699761826e-05, + "loss": 2.6984, + "step": 5437000 + }, + { + "epoch": 1.6903260502947774, + "grad_norm": 16.39474868774414, + "learning_rate": 2.1827899161753713e-05, + "loss": 2.7212, + "step": 5437500 + }, + { + "epoch": 1.6904814825752643, + "grad_norm": 19.620363235473633, + "learning_rate": 2.1825308623745597e-05, + "loss": 2.7353, + "step": 5438000 + }, + { + "epoch": 1.6906369148557512, + "grad_norm": 12.122647285461426, + "learning_rate": 2.1822718085737484e-05, + "loss": 2.7418, + "step": 5438500 + }, + { + "epoch": 1.690792347136238, + "grad_norm": 13.53941535949707, + "learning_rate": 2.1820127547729368e-05, + "loss": 2.7395, + "step": 5439000 + }, + { + "epoch": 1.690947779416725, + "grad_norm": 8.488250732421875, + "learning_rate": 2.1817537009721252e-05, + "loss": 2.7692, + "step": 5439500 + }, + { + "epoch": 1.6911032116972118, + "grad_norm": 9.570056915283203, + "learning_rate": 2.181494647171314e-05, + "loss": 2.6758, + "step": 5440000 + }, + { + "epoch": 1.6912586439776987, + "grad_norm": 10.196828842163086, + "learning_rate": 2.1812355933705026e-05, + "loss": 2.7111, + "step": 5440500 + }, + { + "epoch": 1.6914140762581855, + "grad_norm": 10.983089447021484, + "learning_rate": 2.180976539569691e-05, + "loss": 2.7434, + "step": 5441000 + }, + { + "epoch": 1.6915695085386724, + "grad_norm": 16.950054168701172, + "learning_rate": 2.1807174857688794e-05, + "loss": 2.7591, + "step": 5441500 + }, + { + "epoch": 1.6917249408191593, + "grad_norm": 9.200045585632324, + "learning_rate": 2.180458431968068e-05, + "loss": 2.6796, + "step": 5442000 + }, + { + "epoch": 1.6918803730996461, + "grad_norm": 8.359589576721191, + "learning_rate": 2.1801993781672568e-05, + "loss": 2.7726, + "step": 5442500 + }, + { + "epoch": 1.692035805380133, + "grad_norm": 21.66423988342285, + "learning_rate": 2.179940324366445e-05, + "loss": 2.744, + "step": 5443000 + }, + { + "epoch": 1.6921912376606199, + "grad_norm": 11.86441421508789, + "learning_rate": 2.1796812705656335e-05, + "loss": 2.719, + "step": 5443500 + }, + { + "epoch": 1.6923466699411067, + "grad_norm": 8.78968334197998, + "learning_rate": 2.1794222167648223e-05, + "loss": 2.7369, + "step": 5444000 + }, + { + "epoch": 1.6925021022215936, + "grad_norm": 9.865601539611816, + "learning_rate": 2.1791631629640106e-05, + "loss": 2.7234, + "step": 5444500 + }, + { + "epoch": 1.6926575345020805, + "grad_norm": 8.099620819091797, + "learning_rate": 2.1789041091631993e-05, + "loss": 2.742, + "step": 5445000 + }, + { + "epoch": 1.6928129667825673, + "grad_norm": 9.524434089660645, + "learning_rate": 2.178645055362388e-05, + "loss": 2.7117, + "step": 5445500 + }, + { + "epoch": 1.6929683990630542, + "grad_norm": 9.970251083374023, + "learning_rate": 2.1783860015615764e-05, + "loss": 2.7508, + "step": 5446000 + }, + { + "epoch": 1.693123831343541, + "grad_norm": 8.565669059753418, + "learning_rate": 2.1781269477607648e-05, + "loss": 2.7553, + "step": 5446500 + }, + { + "epoch": 1.693279263624028, + "grad_norm": 9.557374000549316, + "learning_rate": 2.1778678939599535e-05, + "loss": 2.7903, + "step": 5447000 + }, + { + "epoch": 1.6934346959045148, + "grad_norm": 8.462696075439453, + "learning_rate": 2.1776088401591422e-05, + "loss": 2.7613, + "step": 5447500 + }, + { + "epoch": 1.6935901281850017, + "grad_norm": 11.213759422302246, + "learning_rate": 2.1773497863583306e-05, + "loss": 2.7695, + "step": 5448000 + }, + { + "epoch": 1.6937455604654885, + "grad_norm": 11.395525932312012, + "learning_rate": 2.177090732557519e-05, + "loss": 2.7419, + "step": 5448500 + }, + { + "epoch": 1.6939009927459754, + "grad_norm": 9.440208435058594, + "learning_rate": 2.1768316787567077e-05, + "loss": 2.7412, + "step": 5449000 + }, + { + "epoch": 1.6940564250264623, + "grad_norm": 28.527067184448242, + "learning_rate": 2.176572624955896e-05, + "loss": 2.8073, + "step": 5449500 + }, + { + "epoch": 1.6942118573069491, + "grad_norm": 11.220565795898438, + "learning_rate": 2.1763135711550848e-05, + "loss": 2.7198, + "step": 5450000 + }, + { + "epoch": 1.694367289587436, + "grad_norm": 12.263593673706055, + "learning_rate": 2.1760545173542732e-05, + "loss": 2.7754, + "step": 5450500 + }, + { + "epoch": 1.6945227218679229, + "grad_norm": 8.28227710723877, + "learning_rate": 2.175795463553462e-05, + "loss": 2.7571, + "step": 5451000 + }, + { + "epoch": 1.6946781541484097, + "grad_norm": 13.045681953430176, + "learning_rate": 2.1755364097526503e-05, + "loss": 2.752, + "step": 5451500 + }, + { + "epoch": 1.6948335864288966, + "grad_norm": 10.734055519104004, + "learning_rate": 2.175277355951839e-05, + "loss": 2.7881, + "step": 5452000 + }, + { + "epoch": 1.6949890187093835, + "grad_norm": 8.238516807556152, + "learning_rate": 2.1750183021510274e-05, + "loss": 2.7104, + "step": 5452500 + }, + { + "epoch": 1.6951444509898703, + "grad_norm": 11.329143524169922, + "learning_rate": 2.174759248350216e-05, + "loss": 2.7196, + "step": 5453000 + }, + { + "epoch": 1.6952998832703572, + "grad_norm": 8.853720664978027, + "learning_rate": 2.1745001945494045e-05, + "loss": 2.7096, + "step": 5453500 + }, + { + "epoch": 1.6954553155508443, + "grad_norm": 9.980514526367188, + "learning_rate": 2.174241140748593e-05, + "loss": 2.7574, + "step": 5454000 + }, + { + "epoch": 1.6956107478313311, + "grad_norm": 10.392168045043945, + "learning_rate": 2.1739820869477816e-05, + "loss": 2.7042, + "step": 5454500 + }, + { + "epoch": 1.695766180111818, + "grad_norm": 9.499248504638672, + "learning_rate": 2.1737230331469703e-05, + "loss": 2.7245, + "step": 5455000 + }, + { + "epoch": 1.6959216123923049, + "grad_norm": 11.507039070129395, + "learning_rate": 2.1734639793461586e-05, + "loss": 2.6703, + "step": 5455500 + }, + { + "epoch": 1.6960770446727917, + "grad_norm": 9.012566566467285, + "learning_rate": 2.173204925545347e-05, + "loss": 2.7314, + "step": 5456000 + }, + { + "epoch": 1.6962324769532786, + "grad_norm": 15.132343292236328, + "learning_rate": 2.1729458717445357e-05, + "loss": 2.739, + "step": 5456500 + }, + { + "epoch": 1.6963879092337655, + "grad_norm": 9.03873348236084, + "learning_rate": 2.1726868179437245e-05, + "loss": 2.7566, + "step": 5457000 + }, + { + "epoch": 1.6965433415142523, + "grad_norm": 10.53211784362793, + "learning_rate": 2.1724277641429128e-05, + "loss": 2.7712, + "step": 5457500 + }, + { + "epoch": 1.6966987737947392, + "grad_norm": 19.63245964050293, + "learning_rate": 2.1721687103421015e-05, + "loss": 2.7581, + "step": 5458000 + }, + { + "epoch": 1.696854206075226, + "grad_norm": 13.746590614318848, + "learning_rate": 2.17190965654129e-05, + "loss": 2.7831, + "step": 5458500 + }, + { + "epoch": 1.697009638355713, + "grad_norm": 11.890857696533203, + "learning_rate": 2.1716506027404783e-05, + "loss": 2.7428, + "step": 5459000 + }, + { + "epoch": 1.6971650706361998, + "grad_norm": 8.462940216064453, + "learning_rate": 2.171391548939667e-05, + "loss": 2.7439, + "step": 5459500 + }, + { + "epoch": 1.697320502916687, + "grad_norm": 10.122801780700684, + "learning_rate": 2.1711324951388557e-05, + "loss": 2.7143, + "step": 5460000 + }, + { + "epoch": 1.6974759351971738, + "grad_norm": 8.75256061553955, + "learning_rate": 2.170873441338044e-05, + "loss": 2.7291, + "step": 5460500 + }, + { + "epoch": 1.6976313674776606, + "grad_norm": 9.232583045959473, + "learning_rate": 2.1706143875372325e-05, + "loss": 2.7249, + "step": 5461000 + }, + { + "epoch": 1.6977867997581475, + "grad_norm": 9.570618629455566, + "learning_rate": 2.1703553337364212e-05, + "loss": 2.7693, + "step": 5461500 + }, + { + "epoch": 1.6979422320386344, + "grad_norm": 11.427006721496582, + "learning_rate": 2.17009627993561e-05, + "loss": 2.7123, + "step": 5462000 + }, + { + "epoch": 1.6980976643191212, + "grad_norm": 10.126408576965332, + "learning_rate": 2.1698372261347983e-05, + "loss": 2.7341, + "step": 5462500 + }, + { + "epoch": 1.698253096599608, + "grad_norm": 9.170316696166992, + "learning_rate": 2.1695781723339867e-05, + "loss": 2.7388, + "step": 5463000 + }, + { + "epoch": 1.698408528880095, + "grad_norm": 13.907601356506348, + "learning_rate": 2.1693191185331754e-05, + "loss": 2.7816, + "step": 5463500 + }, + { + "epoch": 1.6985639611605818, + "grad_norm": 9.867288589477539, + "learning_rate": 2.1690600647323638e-05, + "loss": 2.7754, + "step": 5464000 + }, + { + "epoch": 1.6987193934410687, + "grad_norm": 10.34104061126709, + "learning_rate": 2.1688010109315525e-05, + "loss": 2.7519, + "step": 5464500 + }, + { + "epoch": 1.6988748257215556, + "grad_norm": 8.667000770568848, + "learning_rate": 2.168541957130741e-05, + "loss": 2.7781, + "step": 5465000 + }, + { + "epoch": 1.6990302580020424, + "grad_norm": 10.451537132263184, + "learning_rate": 2.1682829033299296e-05, + "loss": 2.7586, + "step": 5465500 + }, + { + "epoch": 1.6991856902825293, + "grad_norm": 8.990501403808594, + "learning_rate": 2.168023849529118e-05, + "loss": 2.736, + "step": 5466000 + }, + { + "epoch": 1.6993411225630162, + "grad_norm": 8.844191551208496, + "learning_rate": 2.1677647957283067e-05, + "loss": 2.7238, + "step": 5466500 + }, + { + "epoch": 1.699496554843503, + "grad_norm": 8.514260292053223, + "learning_rate": 2.1675057419274954e-05, + "loss": 2.7454, + "step": 5467000 + }, + { + "epoch": 1.69965198712399, + "grad_norm": 10.371869087219238, + "learning_rate": 2.1672466881266837e-05, + "loss": 2.7306, + "step": 5467500 + }, + { + "epoch": 1.6998074194044768, + "grad_norm": 8.660731315612793, + "learning_rate": 2.166987634325872e-05, + "loss": 2.7312, + "step": 5468000 + }, + { + "epoch": 1.6999628516849636, + "grad_norm": 8.547564506530762, + "learning_rate": 2.1667285805250605e-05, + "loss": 2.754, + "step": 5468500 + }, + { + "epoch": 1.7001182839654505, + "grad_norm": 9.103442192077637, + "learning_rate": 2.1664695267242492e-05, + "loss": 2.7664, + "step": 5469000 + }, + { + "epoch": 1.7002737162459374, + "grad_norm": 9.340608596801758, + "learning_rate": 2.166210472923438e-05, + "loss": 2.7444, + "step": 5469500 + }, + { + "epoch": 1.7004291485264242, + "grad_norm": 9.020768165588379, + "learning_rate": 2.1659514191226263e-05, + "loss": 2.7287, + "step": 5470000 + }, + { + "epoch": 1.700584580806911, + "grad_norm": 8.84559440612793, + "learning_rate": 2.1656923653218147e-05, + "loss": 2.7826, + "step": 5470500 + }, + { + "epoch": 1.700740013087398, + "grad_norm": 15.108623504638672, + "learning_rate": 2.1654333115210034e-05, + "loss": 2.7673, + "step": 5471000 + }, + { + "epoch": 1.7008954453678848, + "grad_norm": 10.739218711853027, + "learning_rate": 2.165174257720192e-05, + "loss": 2.7059, + "step": 5471500 + }, + { + "epoch": 1.7010508776483717, + "grad_norm": 9.373799324035645, + "learning_rate": 2.1649152039193805e-05, + "loss": 2.745, + "step": 5472000 + }, + { + "epoch": 1.7012063099288586, + "grad_norm": 8.808357238769531, + "learning_rate": 2.1646561501185692e-05, + "loss": 2.7395, + "step": 5472500 + }, + { + "epoch": 1.7013617422093454, + "grad_norm": 10.757094383239746, + "learning_rate": 2.1643970963177576e-05, + "loss": 2.699, + "step": 5473000 + }, + { + "epoch": 1.7015171744898323, + "grad_norm": 17.172990798950195, + "learning_rate": 2.164138042516946e-05, + "loss": 2.7846, + "step": 5473500 + }, + { + "epoch": 1.7016726067703192, + "grad_norm": 9.030585289001465, + "learning_rate": 2.1638789887161347e-05, + "loss": 2.7177, + "step": 5474000 + }, + { + "epoch": 1.701828039050806, + "grad_norm": 13.573190689086914, + "learning_rate": 2.1636199349153234e-05, + "loss": 2.7615, + "step": 5474500 + }, + { + "epoch": 1.701983471331293, + "grad_norm": 9.354765892028809, + "learning_rate": 2.1633608811145118e-05, + "loss": 2.7438, + "step": 5475000 + }, + { + "epoch": 1.7021389036117798, + "grad_norm": 9.565254211425781, + "learning_rate": 2.1631018273137e-05, + "loss": 2.7685, + "step": 5475500 + }, + { + "epoch": 1.7022943358922666, + "grad_norm": 10.826582908630371, + "learning_rate": 2.162842773512889e-05, + "loss": 2.7448, + "step": 5476000 + }, + { + "epoch": 1.7024497681727535, + "grad_norm": 12.426241874694824, + "learning_rate": 2.1625837197120776e-05, + "loss": 2.751, + "step": 5476500 + }, + { + "epoch": 1.7026052004532404, + "grad_norm": 7.951786994934082, + "learning_rate": 2.162324665911266e-05, + "loss": 2.7361, + "step": 5477000 + }, + { + "epoch": 1.7027606327337272, + "grad_norm": 13.63802719116211, + "learning_rate": 2.1620656121104543e-05, + "loss": 2.7726, + "step": 5477500 + }, + { + "epoch": 1.7029160650142143, + "grad_norm": 22.506357192993164, + "learning_rate": 2.161806558309643e-05, + "loss": 2.7671, + "step": 5478000 + }, + { + "epoch": 1.7030714972947012, + "grad_norm": 10.02541446685791, + "learning_rate": 2.1615475045088314e-05, + "loss": 2.7733, + "step": 5478500 + }, + { + "epoch": 1.703226929575188, + "grad_norm": 9.533193588256836, + "learning_rate": 2.16128845070802e-05, + "loss": 2.7441, + "step": 5479000 + }, + { + "epoch": 1.703382361855675, + "grad_norm": 10.365537643432617, + "learning_rate": 2.1610293969072085e-05, + "loss": 2.7593, + "step": 5479500 + }, + { + "epoch": 1.7035377941361618, + "grad_norm": 8.752132415771484, + "learning_rate": 2.1607703431063972e-05, + "loss": 2.7361, + "step": 5480000 + }, + { + "epoch": 1.7036932264166487, + "grad_norm": 9.479853630065918, + "learning_rate": 2.1605112893055856e-05, + "loss": 2.7376, + "step": 5480500 + }, + { + "epoch": 1.7038486586971355, + "grad_norm": 5.893409729003906, + "learning_rate": 2.160252235504774e-05, + "loss": 2.746, + "step": 5481000 + }, + { + "epoch": 1.7040040909776224, + "grad_norm": 9.649892807006836, + "learning_rate": 2.159993181703963e-05, + "loss": 2.7207, + "step": 5481500 + }, + { + "epoch": 1.7041595232581093, + "grad_norm": 11.977717399597168, + "learning_rate": 2.1597341279031514e-05, + "loss": 2.7995, + "step": 5482000 + }, + { + "epoch": 1.7043149555385961, + "grad_norm": 11.084014892578125, + "learning_rate": 2.1594750741023398e-05, + "loss": 2.7511, + "step": 5482500 + }, + { + "epoch": 1.704470387819083, + "grad_norm": 48.05304718017578, + "learning_rate": 2.159216020301528e-05, + "loss": 2.7393, + "step": 5483000 + }, + { + "epoch": 1.7046258200995699, + "grad_norm": 8.519598007202148, + "learning_rate": 2.158956966500717e-05, + "loss": 2.7332, + "step": 5483500 + }, + { + "epoch": 1.704781252380057, + "grad_norm": 8.469861030578613, + "learning_rate": 2.1586979126999056e-05, + "loss": 2.7888, + "step": 5484000 + }, + { + "epoch": 1.7049366846605438, + "grad_norm": 10.72157096862793, + "learning_rate": 2.158438858899094e-05, + "loss": 2.7318, + "step": 5484500 + }, + { + "epoch": 1.7050921169410307, + "grad_norm": 12.15922737121582, + "learning_rate": 2.1581798050982827e-05, + "loss": 2.7517, + "step": 5485000 + }, + { + "epoch": 1.7052475492215176, + "grad_norm": 7.257087707519531, + "learning_rate": 2.157920751297471e-05, + "loss": 2.6911, + "step": 5485500 + }, + { + "epoch": 1.7054029815020044, + "grad_norm": 10.115007400512695, + "learning_rate": 2.1576616974966594e-05, + "loss": 2.7047, + "step": 5486000 + }, + { + "epoch": 1.7055584137824913, + "grad_norm": 8.299212455749512, + "learning_rate": 2.157402643695848e-05, + "loss": 2.749, + "step": 5486500 + }, + { + "epoch": 1.7057138460629782, + "grad_norm": 11.895821571350098, + "learning_rate": 2.157143589895037e-05, + "loss": 2.7354, + "step": 5487000 + }, + { + "epoch": 1.705869278343465, + "grad_norm": 10.211737632751465, + "learning_rate": 2.1568845360942252e-05, + "loss": 2.7084, + "step": 5487500 + }, + { + "epoch": 1.706024710623952, + "grad_norm": 8.14029312133789, + "learning_rate": 2.1566254822934136e-05, + "loss": 2.7432, + "step": 5488000 + }, + { + "epoch": 1.7061801429044388, + "grad_norm": 11.148642539978027, + "learning_rate": 2.1563664284926023e-05, + "loss": 2.721, + "step": 5488500 + }, + { + "epoch": 1.7063355751849256, + "grad_norm": 7.903735637664795, + "learning_rate": 2.156107374691791e-05, + "loss": 2.7858, + "step": 5489000 + }, + { + "epoch": 1.7064910074654125, + "grad_norm": 9.1480131149292, + "learning_rate": 2.1558483208909794e-05, + "loss": 2.6881, + "step": 5489500 + }, + { + "epoch": 1.7066464397458994, + "grad_norm": 9.220643043518066, + "learning_rate": 2.1555892670901678e-05, + "loss": 2.7874, + "step": 5490000 + }, + { + "epoch": 1.7068018720263862, + "grad_norm": 8.811251640319824, + "learning_rate": 2.1553302132893565e-05, + "loss": 2.7332, + "step": 5490500 + }, + { + "epoch": 1.706957304306873, + "grad_norm": 10.951510429382324, + "learning_rate": 2.155071159488545e-05, + "loss": 2.762, + "step": 5491000 + }, + { + "epoch": 1.70711273658736, + "grad_norm": 9.030696868896484, + "learning_rate": 2.1548121056877336e-05, + "loss": 2.7316, + "step": 5491500 + }, + { + "epoch": 1.7072681688678468, + "grad_norm": 7.79345178604126, + "learning_rate": 2.154553051886922e-05, + "loss": 2.763, + "step": 5492000 + }, + { + "epoch": 1.7074236011483337, + "grad_norm": 8.118319511413574, + "learning_rate": 2.1542939980861107e-05, + "loss": 2.7708, + "step": 5492500 + }, + { + "epoch": 1.7075790334288206, + "grad_norm": 13.194192886352539, + "learning_rate": 2.154034944285299e-05, + "loss": 2.7737, + "step": 5493000 + }, + { + "epoch": 1.7077344657093074, + "grad_norm": 9.49919319152832, + "learning_rate": 2.1537758904844878e-05, + "loss": 2.7374, + "step": 5493500 + }, + { + "epoch": 1.7078898979897943, + "grad_norm": 9.973955154418945, + "learning_rate": 2.1535168366836765e-05, + "loss": 2.7685, + "step": 5494000 + }, + { + "epoch": 1.7080453302702812, + "grad_norm": 25.390830993652344, + "learning_rate": 2.153257782882865e-05, + "loss": 2.7429, + "step": 5494500 + }, + { + "epoch": 1.708200762550768, + "grad_norm": 8.451862335205078, + "learning_rate": 2.1529987290820533e-05, + "loss": 2.7022, + "step": 5495000 + }, + { + "epoch": 1.708356194831255, + "grad_norm": 13.57994556427002, + "learning_rate": 2.1527396752812416e-05, + "loss": 2.796, + "step": 5495500 + }, + { + "epoch": 1.7085116271117418, + "grad_norm": 9.379958152770996, + "learning_rate": 2.1524806214804304e-05, + "loss": 2.7532, + "step": 5496000 + }, + { + "epoch": 1.7086670593922286, + "grad_norm": 10.137646675109863, + "learning_rate": 2.152221567679619e-05, + "loss": 2.759, + "step": 5496500 + }, + { + "epoch": 1.7088224916727155, + "grad_norm": 7.585217475891113, + "learning_rate": 2.1519625138788074e-05, + "loss": 2.729, + "step": 5497000 + }, + { + "epoch": 1.7089779239532024, + "grad_norm": 11.835366249084473, + "learning_rate": 2.1517034600779958e-05, + "loss": 2.7605, + "step": 5497500 + }, + { + "epoch": 1.7091333562336892, + "grad_norm": 13.357677459716797, + "learning_rate": 2.1514444062771845e-05, + "loss": 2.7287, + "step": 5498000 + }, + { + "epoch": 1.709288788514176, + "grad_norm": 8.815888404846191, + "learning_rate": 2.1511853524763733e-05, + "loss": 2.7221, + "step": 5498500 + }, + { + "epoch": 1.709444220794663, + "grad_norm": 9.265544891357422, + "learning_rate": 2.1509262986755616e-05, + "loss": 2.7144, + "step": 5499000 + }, + { + "epoch": 1.7095996530751498, + "grad_norm": 9.109963417053223, + "learning_rate": 2.1506672448747503e-05, + "loss": 2.7481, + "step": 5499500 + }, + { + "epoch": 1.7097550853556367, + "grad_norm": 10.029629707336426, + "learning_rate": 2.1504081910739387e-05, + "loss": 2.733, + "step": 5500000 + }, + { + "epoch": 1.7099105176361236, + "grad_norm": 11.080695152282715, + "learning_rate": 2.150149137273127e-05, + "loss": 2.7576, + "step": 5500500 + }, + { + "epoch": 1.7100659499166104, + "grad_norm": 9.937925338745117, + "learning_rate": 2.1498900834723158e-05, + "loss": 2.7447, + "step": 5501000 + }, + { + "epoch": 1.7102213821970973, + "grad_norm": 8.016855239868164, + "learning_rate": 2.1496310296715045e-05, + "loss": 2.7338, + "step": 5501500 + }, + { + "epoch": 1.7103768144775844, + "grad_norm": 11.547664642333984, + "learning_rate": 2.149371975870693e-05, + "loss": 2.7228, + "step": 5502000 + }, + { + "epoch": 1.7105322467580713, + "grad_norm": 9.017148971557617, + "learning_rate": 2.1491129220698813e-05, + "loss": 2.7663, + "step": 5502500 + }, + { + "epoch": 1.7106876790385581, + "grad_norm": 18.297746658325195, + "learning_rate": 2.14885386826907e-05, + "loss": 2.7784, + "step": 5503000 + }, + { + "epoch": 1.710843111319045, + "grad_norm": 11.095215797424316, + "learning_rate": 2.1485948144682587e-05, + "loss": 2.7397, + "step": 5503500 + }, + { + "epoch": 1.7109985435995319, + "grad_norm": 17.50175666809082, + "learning_rate": 2.148335760667447e-05, + "loss": 2.726, + "step": 5504000 + }, + { + "epoch": 1.7111539758800187, + "grad_norm": 8.707859992980957, + "learning_rate": 2.1480767068666355e-05, + "loss": 2.7631, + "step": 5504500 + }, + { + "epoch": 1.7113094081605056, + "grad_norm": 9.821239471435547, + "learning_rate": 2.1478176530658242e-05, + "loss": 2.7532, + "step": 5505000 + }, + { + "epoch": 1.7114648404409925, + "grad_norm": 10.101760864257812, + "learning_rate": 2.1475585992650126e-05, + "loss": 2.7355, + "step": 5505500 + }, + { + "epoch": 1.7116202727214793, + "grad_norm": 9.476603507995605, + "learning_rate": 2.1472995454642013e-05, + "loss": 2.7532, + "step": 5506000 + }, + { + "epoch": 1.7117757050019662, + "grad_norm": 6.921743869781494, + "learning_rate": 2.1470404916633897e-05, + "loss": 2.7592, + "step": 5506500 + }, + { + "epoch": 1.711931137282453, + "grad_norm": 9.812281608581543, + "learning_rate": 2.1467814378625784e-05, + "loss": 2.7756, + "step": 5507000 + }, + { + "epoch": 1.71208656956294, + "grad_norm": 14.62448501586914, + "learning_rate": 2.1465223840617667e-05, + "loss": 2.746, + "step": 5507500 + }, + { + "epoch": 1.712242001843427, + "grad_norm": 9.900371551513672, + "learning_rate": 2.146263330260955e-05, + "loss": 2.7555, + "step": 5508000 + }, + { + "epoch": 1.7123974341239139, + "grad_norm": 8.628854751586914, + "learning_rate": 2.1460042764601442e-05, + "loss": 2.7789, + "step": 5508500 + }, + { + "epoch": 1.7125528664044007, + "grad_norm": 8.07375431060791, + "learning_rate": 2.1457452226593325e-05, + "loss": 2.7995, + "step": 5509000 + }, + { + "epoch": 1.7127082986848876, + "grad_norm": 11.577656745910645, + "learning_rate": 2.145486168858521e-05, + "loss": 2.7264, + "step": 5509500 + }, + { + "epoch": 1.7128637309653745, + "grad_norm": 7.094304084777832, + "learning_rate": 2.1452271150577093e-05, + "loss": 2.7277, + "step": 5510000 + }, + { + "epoch": 1.7130191632458613, + "grad_norm": 9.545989990234375, + "learning_rate": 2.144968061256898e-05, + "loss": 2.8063, + "step": 5510500 + }, + { + "epoch": 1.7131745955263482, + "grad_norm": 8.905646324157715, + "learning_rate": 2.1447090074560867e-05, + "loss": 2.7702, + "step": 5511000 + }, + { + "epoch": 1.713330027806835, + "grad_norm": 9.40711498260498, + "learning_rate": 2.144449953655275e-05, + "loss": 2.7503, + "step": 5511500 + }, + { + "epoch": 1.713485460087322, + "grad_norm": 10.803182601928711, + "learning_rate": 2.1441908998544638e-05, + "loss": 2.792, + "step": 5512000 + }, + { + "epoch": 1.7136408923678088, + "grad_norm": 8.128412246704102, + "learning_rate": 2.1439318460536522e-05, + "loss": 2.7501, + "step": 5512500 + }, + { + "epoch": 1.7137963246482957, + "grad_norm": 29.202911376953125, + "learning_rate": 2.1436727922528406e-05, + "loss": 2.7488, + "step": 5513000 + }, + { + "epoch": 1.7139517569287825, + "grad_norm": 9.593265533447266, + "learning_rate": 2.1434137384520293e-05, + "loss": 2.7306, + "step": 5513500 + }, + { + "epoch": 1.7141071892092694, + "grad_norm": 6.9481096267700195, + "learning_rate": 2.143154684651218e-05, + "loss": 2.7196, + "step": 5514000 + }, + { + "epoch": 1.7142626214897563, + "grad_norm": 11.614511489868164, + "learning_rate": 2.1428956308504064e-05, + "loss": 2.7357, + "step": 5514500 + }, + { + "epoch": 1.7144180537702431, + "grad_norm": 13.892868041992188, + "learning_rate": 2.1426365770495948e-05, + "loss": 2.7328, + "step": 5515000 + }, + { + "epoch": 1.71457348605073, + "grad_norm": 9.097023010253906, + "learning_rate": 2.1423775232487835e-05, + "loss": 2.7533, + "step": 5515500 + }, + { + "epoch": 1.7147289183312169, + "grad_norm": 9.013625144958496, + "learning_rate": 2.1421184694479722e-05, + "loss": 2.742, + "step": 5516000 + }, + { + "epoch": 1.7148843506117037, + "grad_norm": 9.053491592407227, + "learning_rate": 2.1418594156471606e-05, + "loss": 2.7143, + "step": 5516500 + }, + { + "epoch": 1.7150397828921906, + "grad_norm": 8.948716163635254, + "learning_rate": 2.141600361846349e-05, + "loss": 2.7553, + "step": 5517000 + }, + { + "epoch": 1.7151952151726775, + "grad_norm": 9.797051429748535, + "learning_rate": 2.1413413080455377e-05, + "loss": 2.767, + "step": 5517500 + }, + { + "epoch": 1.7153506474531643, + "grad_norm": 10.344429016113281, + "learning_rate": 2.141082254244726e-05, + "loss": 2.7577, + "step": 5518000 + }, + { + "epoch": 1.7155060797336512, + "grad_norm": 21.948286056518555, + "learning_rate": 2.1408232004439148e-05, + "loss": 2.78, + "step": 5518500 + }, + { + "epoch": 1.715661512014138, + "grad_norm": 10.552361488342285, + "learning_rate": 2.140564146643103e-05, + "loss": 2.7513, + "step": 5519000 + }, + { + "epoch": 1.715816944294625, + "grad_norm": 7.580591201782227, + "learning_rate": 2.140305092842292e-05, + "loss": 2.7366, + "step": 5519500 + }, + { + "epoch": 1.7159723765751118, + "grad_norm": 11.184033393859863, + "learning_rate": 2.1400460390414802e-05, + "loss": 2.7345, + "step": 5520000 + }, + { + "epoch": 1.7161278088555987, + "grad_norm": 7.641631603240967, + "learning_rate": 2.139786985240669e-05, + "loss": 2.701, + "step": 5520500 + }, + { + "epoch": 1.7162832411360855, + "grad_norm": 10.630261421203613, + "learning_rate": 2.1395279314398577e-05, + "loss": 2.7774, + "step": 5521000 + }, + { + "epoch": 1.7164386734165724, + "grad_norm": 8.731301307678223, + "learning_rate": 2.139268877639046e-05, + "loss": 2.7079, + "step": 5521500 + }, + { + "epoch": 1.7165941056970593, + "grad_norm": 9.954436302185059, + "learning_rate": 2.1390098238382344e-05, + "loss": 2.7124, + "step": 5522000 + }, + { + "epoch": 1.7167495379775461, + "grad_norm": 10.347540855407715, + "learning_rate": 2.1387507700374228e-05, + "loss": 2.7526, + "step": 5522500 + }, + { + "epoch": 1.716904970258033, + "grad_norm": 7.516777515411377, + "learning_rate": 2.1384917162366115e-05, + "loss": 2.7644, + "step": 5523000 + }, + { + "epoch": 1.7170604025385199, + "grad_norm": 10.309280395507812, + "learning_rate": 2.1382326624358002e-05, + "loss": 2.7671, + "step": 5523500 + }, + { + "epoch": 1.7172158348190067, + "grad_norm": 8.318767547607422, + "learning_rate": 2.1379736086349886e-05, + "loss": 2.7347, + "step": 5524000 + }, + { + "epoch": 1.7173712670994936, + "grad_norm": 7.9413981437683105, + "learning_rate": 2.137714554834177e-05, + "loss": 2.7546, + "step": 5524500 + }, + { + "epoch": 1.7175266993799805, + "grad_norm": 13.300775527954102, + "learning_rate": 2.1374555010333657e-05, + "loss": 2.7052, + "step": 5525000 + }, + { + "epoch": 1.7176821316604673, + "grad_norm": 48.25790786743164, + "learning_rate": 2.1371964472325544e-05, + "loss": 2.6766, + "step": 5525500 + }, + { + "epoch": 1.7178375639409544, + "grad_norm": 10.52775764465332, + "learning_rate": 2.1369373934317428e-05, + "loss": 2.7521, + "step": 5526000 + }, + { + "epoch": 1.7179929962214413, + "grad_norm": 6.934577941894531, + "learning_rate": 2.1366783396309315e-05, + "loss": 2.7374, + "step": 5526500 + }, + { + "epoch": 1.7181484285019282, + "grad_norm": 9.241016387939453, + "learning_rate": 2.13641928583012e-05, + "loss": 2.7214, + "step": 5527000 + }, + { + "epoch": 1.718303860782415, + "grad_norm": 9.448487281799316, + "learning_rate": 2.1361602320293082e-05, + "loss": 2.7197, + "step": 5527500 + }, + { + "epoch": 1.718459293062902, + "grad_norm": 8.839421272277832, + "learning_rate": 2.135901178228497e-05, + "loss": 2.7386, + "step": 5528000 + }, + { + "epoch": 1.7186147253433888, + "grad_norm": 6.377169609069824, + "learning_rate": 2.1356421244276857e-05, + "loss": 2.7367, + "step": 5528500 + }, + { + "epoch": 1.7187701576238756, + "grad_norm": 6.644554615020752, + "learning_rate": 2.135383070626874e-05, + "loss": 2.7266, + "step": 5529000 + }, + { + "epoch": 1.7189255899043625, + "grad_norm": 8.795181274414062, + "learning_rate": 2.1351240168260624e-05, + "loss": 2.7245, + "step": 5529500 + }, + { + "epoch": 1.7190810221848494, + "grad_norm": 12.831358909606934, + "learning_rate": 2.134864963025251e-05, + "loss": 2.7309, + "step": 5530000 + }, + { + "epoch": 1.7192364544653362, + "grad_norm": 6.492427349090576, + "learning_rate": 2.13460590922444e-05, + "loss": 2.7682, + "step": 5530500 + }, + { + "epoch": 1.719391886745823, + "grad_norm": 50.04000473022461, + "learning_rate": 2.1343468554236282e-05, + "loss": 2.7072, + "step": 5531000 + }, + { + "epoch": 1.71954731902631, + "grad_norm": 8.347942352294922, + "learning_rate": 2.1340878016228166e-05, + "loss": 2.7544, + "step": 5531500 + }, + { + "epoch": 1.719702751306797, + "grad_norm": 7.828156471252441, + "learning_rate": 2.1338287478220053e-05, + "loss": 2.7724, + "step": 5532000 + }, + { + "epoch": 1.719858183587284, + "grad_norm": 10.84539794921875, + "learning_rate": 2.1335696940211937e-05, + "loss": 2.7704, + "step": 5532500 + }, + { + "epoch": 1.7200136158677708, + "grad_norm": 7.751970291137695, + "learning_rate": 2.1333106402203824e-05, + "loss": 2.7799, + "step": 5533000 + }, + { + "epoch": 1.7201690481482577, + "grad_norm": 8.845209121704102, + "learning_rate": 2.1330515864195708e-05, + "loss": 2.7782, + "step": 5533500 + }, + { + "epoch": 1.7203244804287445, + "grad_norm": 11.159732818603516, + "learning_rate": 2.1327925326187595e-05, + "loss": 2.7247, + "step": 5534000 + }, + { + "epoch": 1.7204799127092314, + "grad_norm": 9.229079246520996, + "learning_rate": 2.132533478817948e-05, + "loss": 2.7339, + "step": 5534500 + }, + { + "epoch": 1.7206353449897183, + "grad_norm": 7.828648567199707, + "learning_rate": 2.1322744250171363e-05, + "loss": 2.7718, + "step": 5535000 + }, + { + "epoch": 1.7207907772702051, + "grad_norm": 8.134477615356445, + "learning_rate": 2.1320153712163253e-05, + "loss": 2.7598, + "step": 5535500 + }, + { + "epoch": 1.720946209550692, + "grad_norm": 5.847896575927734, + "learning_rate": 2.1317563174155137e-05, + "loss": 2.7119, + "step": 5536000 + }, + { + "epoch": 1.7211016418311789, + "grad_norm": 9.400291442871094, + "learning_rate": 2.131497263614702e-05, + "loss": 2.7674, + "step": 5536500 + }, + { + "epoch": 1.7212570741116657, + "grad_norm": 8.427459716796875, + "learning_rate": 2.1312382098138904e-05, + "loss": 2.7095, + "step": 5537000 + }, + { + "epoch": 1.7214125063921526, + "grad_norm": 10.002022743225098, + "learning_rate": 2.130979156013079e-05, + "loss": 2.7413, + "step": 5537500 + }, + { + "epoch": 1.7215679386726395, + "grad_norm": 6.855698585510254, + "learning_rate": 2.130720102212268e-05, + "loss": 2.7431, + "step": 5538000 + }, + { + "epoch": 1.7217233709531263, + "grad_norm": 9.977048873901367, + "learning_rate": 2.1304610484114563e-05, + "loss": 2.7316, + "step": 5538500 + }, + { + "epoch": 1.7218788032336132, + "grad_norm": 8.163079261779785, + "learning_rate": 2.130201994610645e-05, + "loss": 2.7404, + "step": 5539000 + }, + { + "epoch": 1.7220342355141, + "grad_norm": 11.769575119018555, + "learning_rate": 2.1299429408098333e-05, + "loss": 2.771, + "step": 5539500 + }, + { + "epoch": 1.722189667794587, + "grad_norm": 9.120787620544434, + "learning_rate": 2.1296838870090217e-05, + "loss": 2.7071, + "step": 5540000 + }, + { + "epoch": 1.7223451000750738, + "grad_norm": 4.923160552978516, + "learning_rate": 2.1294248332082104e-05, + "loss": 2.698, + "step": 5540500 + }, + { + "epoch": 1.7225005323555607, + "grad_norm": 5.839055061340332, + "learning_rate": 2.129165779407399e-05, + "loss": 2.7477, + "step": 5541000 + }, + { + "epoch": 1.7226559646360475, + "grad_norm": 10.576634407043457, + "learning_rate": 2.1289067256065875e-05, + "loss": 2.716, + "step": 5541500 + }, + { + "epoch": 1.7228113969165344, + "grad_norm": 7.528358459472656, + "learning_rate": 2.128647671805776e-05, + "loss": 2.7596, + "step": 5542000 + }, + { + "epoch": 1.7229668291970213, + "grad_norm": 10.020366668701172, + "learning_rate": 2.1283886180049646e-05, + "loss": 2.7696, + "step": 5542500 + }, + { + "epoch": 1.7231222614775081, + "grad_norm": 12.268353462219238, + "learning_rate": 2.1281295642041533e-05, + "loss": 2.6825, + "step": 5543000 + }, + { + "epoch": 1.723277693757995, + "grad_norm": 7.5734171867370605, + "learning_rate": 2.1278705104033417e-05, + "loss": 2.7025, + "step": 5543500 + }, + { + "epoch": 1.7234331260384819, + "grad_norm": 9.398946762084961, + "learning_rate": 2.12761145660253e-05, + "loss": 2.7466, + "step": 5544000 + }, + { + "epoch": 1.7235885583189687, + "grad_norm": 10.03502368927002, + "learning_rate": 2.1273524028017188e-05, + "loss": 2.7316, + "step": 5544500 + }, + { + "epoch": 1.7237439905994556, + "grad_norm": 11.99454116821289, + "learning_rate": 2.1270933490009072e-05, + "loss": 2.7565, + "step": 5545000 + }, + { + "epoch": 1.7238994228799425, + "grad_norm": 22.506919860839844, + "learning_rate": 2.126834295200096e-05, + "loss": 2.7324, + "step": 5545500 + }, + { + "epoch": 1.7240548551604293, + "grad_norm": 10.64415168762207, + "learning_rate": 2.1265752413992843e-05, + "loss": 2.7427, + "step": 5546000 + }, + { + "epoch": 1.7242102874409162, + "grad_norm": 10.139328956604004, + "learning_rate": 2.126316187598473e-05, + "loss": 2.7072, + "step": 5546500 + }, + { + "epoch": 1.724365719721403, + "grad_norm": 10.223682403564453, + "learning_rate": 2.1260571337976614e-05, + "loss": 2.7917, + "step": 5547000 + }, + { + "epoch": 1.72452115200189, + "grad_norm": 8.649810791015625, + "learning_rate": 2.12579807999685e-05, + "loss": 2.7262, + "step": 5547500 + }, + { + "epoch": 1.7246765842823768, + "grad_norm": 15.05829906463623, + "learning_rate": 2.1255390261960388e-05, + "loss": 2.7439, + "step": 5548000 + }, + { + "epoch": 1.7248320165628637, + "grad_norm": 9.946439743041992, + "learning_rate": 2.1252799723952272e-05, + "loss": 2.7223, + "step": 5548500 + }, + { + "epoch": 1.7249874488433505, + "grad_norm": 7.835216045379639, + "learning_rate": 2.1250209185944155e-05, + "loss": 2.7744, + "step": 5549000 + }, + { + "epoch": 1.7251428811238374, + "grad_norm": 12.113922119140625, + "learning_rate": 2.124761864793604e-05, + "loss": 2.7568, + "step": 5549500 + }, + { + "epoch": 1.7252983134043245, + "grad_norm": 8.220240592956543, + "learning_rate": 2.1245028109927926e-05, + "loss": 2.747, + "step": 5550000 + }, + { + "epoch": 1.7254537456848114, + "grad_norm": 9.538494110107422, + "learning_rate": 2.1242437571919814e-05, + "loss": 2.7902, + "step": 5550500 + }, + { + "epoch": 1.7256091779652982, + "grad_norm": 11.763368606567383, + "learning_rate": 2.1239847033911697e-05, + "loss": 2.7497, + "step": 5551000 + }, + { + "epoch": 1.725764610245785, + "grad_norm": 8.58471393585205, + "learning_rate": 2.123725649590358e-05, + "loss": 2.7209, + "step": 5551500 + }, + { + "epoch": 1.725920042526272, + "grad_norm": 9.332964897155762, + "learning_rate": 2.1234665957895468e-05, + "loss": 2.7272, + "step": 5552000 + }, + { + "epoch": 1.7260754748067588, + "grad_norm": 19.104236602783203, + "learning_rate": 2.1232075419887355e-05, + "loss": 2.7338, + "step": 5552500 + }, + { + "epoch": 1.7262309070872457, + "grad_norm": 33.84092712402344, + "learning_rate": 2.122948488187924e-05, + "loss": 2.7586, + "step": 5553000 + }, + { + "epoch": 1.7263863393677326, + "grad_norm": 10.496565818786621, + "learning_rate": 2.1226894343871126e-05, + "loss": 2.7512, + "step": 5553500 + }, + { + "epoch": 1.7265417716482194, + "grad_norm": 9.183778762817383, + "learning_rate": 2.122430380586301e-05, + "loss": 2.7404, + "step": 5554000 + }, + { + "epoch": 1.7266972039287063, + "grad_norm": 9.712733268737793, + "learning_rate": 2.1221713267854894e-05, + "loss": 2.6942, + "step": 5554500 + }, + { + "epoch": 1.7268526362091932, + "grad_norm": 15.476960182189941, + "learning_rate": 2.121912272984678e-05, + "loss": 2.7348, + "step": 5555000 + }, + { + "epoch": 1.72700806848968, + "grad_norm": 10.782112121582031, + "learning_rate": 2.1216532191838668e-05, + "loss": 2.7221, + "step": 5555500 + }, + { + "epoch": 1.7271635007701671, + "grad_norm": 8.887093544006348, + "learning_rate": 2.1213941653830552e-05, + "loss": 2.7088, + "step": 5556000 + }, + { + "epoch": 1.727318933050654, + "grad_norm": 9.7695894241333, + "learning_rate": 2.1211351115822436e-05, + "loss": 2.7385, + "step": 5556500 + }, + { + "epoch": 1.7274743653311408, + "grad_norm": 30.867183685302734, + "learning_rate": 2.1208760577814323e-05, + "loss": 2.7579, + "step": 5557000 + }, + { + "epoch": 1.7276297976116277, + "grad_norm": 21.246206283569336, + "learning_rate": 2.120617003980621e-05, + "loss": 2.7671, + "step": 5557500 + }, + { + "epoch": 1.7277852298921146, + "grad_norm": 9.437338829040527, + "learning_rate": 2.1203579501798094e-05, + "loss": 2.7386, + "step": 5558000 + }, + { + "epoch": 1.7279406621726014, + "grad_norm": 8.156859397888184, + "learning_rate": 2.1200988963789977e-05, + "loss": 2.7572, + "step": 5558500 + }, + { + "epoch": 1.7280960944530883, + "grad_norm": 10.63508415222168, + "learning_rate": 2.1198398425781865e-05, + "loss": 2.7442, + "step": 5559000 + }, + { + "epoch": 1.7282515267335752, + "grad_norm": 9.316888809204102, + "learning_rate": 2.119580788777375e-05, + "loss": 2.7922, + "step": 5559500 + }, + { + "epoch": 1.728406959014062, + "grad_norm": 9.668407440185547, + "learning_rate": 2.1193217349765636e-05, + "loss": 2.7047, + "step": 5560000 + }, + { + "epoch": 1.728562391294549, + "grad_norm": 8.746081352233887, + "learning_rate": 2.119062681175752e-05, + "loss": 2.731, + "step": 5560500 + }, + { + "epoch": 1.7287178235750358, + "grad_norm": 9.161542892456055, + "learning_rate": 2.1188036273749406e-05, + "loss": 2.8007, + "step": 5561000 + }, + { + "epoch": 1.7288732558555227, + "grad_norm": 9.023661613464355, + "learning_rate": 2.118544573574129e-05, + "loss": 2.7448, + "step": 5561500 + }, + { + "epoch": 1.7290286881360095, + "grad_norm": 13.098727226257324, + "learning_rate": 2.1182855197733177e-05, + "loss": 2.7422, + "step": 5562000 + }, + { + "epoch": 1.7291841204164964, + "grad_norm": 8.280029296875, + "learning_rate": 2.1180264659725065e-05, + "loss": 2.7362, + "step": 5562500 + }, + { + "epoch": 1.7293395526969833, + "grad_norm": 9.439291954040527, + "learning_rate": 2.117767412171695e-05, + "loss": 2.7164, + "step": 5563000 + }, + { + "epoch": 1.7294949849774701, + "grad_norm": 9.384684562683105, + "learning_rate": 2.1175083583708832e-05, + "loss": 2.7017, + "step": 5563500 + }, + { + "epoch": 1.729650417257957, + "grad_norm": 10.76909351348877, + "learning_rate": 2.1172493045700716e-05, + "loss": 2.7595, + "step": 5564000 + }, + { + "epoch": 1.7298058495384439, + "grad_norm": 9.35508918762207, + "learning_rate": 2.1169902507692603e-05, + "loss": 2.7468, + "step": 5564500 + }, + { + "epoch": 1.7299612818189307, + "grad_norm": 16.070528030395508, + "learning_rate": 2.116731196968449e-05, + "loss": 2.7645, + "step": 5565000 + }, + { + "epoch": 1.7301167140994176, + "grad_norm": 10.196606636047363, + "learning_rate": 2.1164721431676374e-05, + "loss": 2.8013, + "step": 5565500 + }, + { + "epoch": 1.7302721463799045, + "grad_norm": 8.37432861328125, + "learning_rate": 2.116213089366826e-05, + "loss": 2.761, + "step": 5566000 + }, + { + "epoch": 1.7304275786603913, + "grad_norm": 9.052590370178223, + "learning_rate": 2.1159540355660145e-05, + "loss": 2.7701, + "step": 5566500 + }, + { + "epoch": 1.7305830109408782, + "grad_norm": 10.019648551940918, + "learning_rate": 2.1156949817652032e-05, + "loss": 2.785, + "step": 5567000 + }, + { + "epoch": 1.730738443221365, + "grad_norm": 10.0673189163208, + "learning_rate": 2.1154359279643916e-05, + "loss": 2.6723, + "step": 5567500 + }, + { + "epoch": 1.730893875501852, + "grad_norm": 9.655309677124023, + "learning_rate": 2.1151768741635803e-05, + "loss": 2.7693, + "step": 5568000 + }, + { + "epoch": 1.7310493077823388, + "grad_norm": 7.654660701751709, + "learning_rate": 2.1149178203627687e-05, + "loss": 2.7479, + "step": 5568500 + }, + { + "epoch": 1.7312047400628257, + "grad_norm": 19.3655948638916, + "learning_rate": 2.114658766561957e-05, + "loss": 2.7462, + "step": 5569000 + }, + { + "epoch": 1.7313601723433125, + "grad_norm": 9.954401969909668, + "learning_rate": 2.1143997127611458e-05, + "loss": 2.7168, + "step": 5569500 + }, + { + "epoch": 1.7315156046237994, + "grad_norm": 9.352298736572266, + "learning_rate": 2.1141406589603345e-05, + "loss": 2.7214, + "step": 5570000 + }, + { + "epoch": 1.7316710369042863, + "grad_norm": 9.841276168823242, + "learning_rate": 2.113881605159523e-05, + "loss": 2.7545, + "step": 5570500 + }, + { + "epoch": 1.7318264691847731, + "grad_norm": 8.061737060546875, + "learning_rate": 2.1136225513587112e-05, + "loss": 2.7327, + "step": 5571000 + }, + { + "epoch": 1.73198190146526, + "grad_norm": 10.371869087219238, + "learning_rate": 2.1133634975579e-05, + "loss": 2.7778, + "step": 5571500 + }, + { + "epoch": 1.7321373337457469, + "grad_norm": 8.397411346435547, + "learning_rate": 2.1131044437570887e-05, + "loss": 2.7836, + "step": 5572000 + }, + { + "epoch": 1.7322927660262337, + "grad_norm": 8.325494766235352, + "learning_rate": 2.112845389956277e-05, + "loss": 2.7727, + "step": 5572500 + }, + { + "epoch": 1.7324481983067206, + "grad_norm": 9.406024932861328, + "learning_rate": 2.1125863361554654e-05, + "loss": 2.727, + "step": 5573000 + }, + { + "epoch": 1.7326036305872075, + "grad_norm": 14.020661354064941, + "learning_rate": 2.112327282354654e-05, + "loss": 2.6944, + "step": 5573500 + }, + { + "epoch": 1.7327590628676945, + "grad_norm": 8.264395713806152, + "learning_rate": 2.1120682285538425e-05, + "loss": 2.7262, + "step": 5574000 + }, + { + "epoch": 1.7329144951481814, + "grad_norm": 11.103447914123535, + "learning_rate": 2.1118091747530312e-05, + "loss": 2.7046, + "step": 5574500 + }, + { + "epoch": 1.7330699274286683, + "grad_norm": 10.617964744567871, + "learning_rate": 2.11155012095222e-05, + "loss": 2.736, + "step": 5575000 + }, + { + "epoch": 1.7332253597091551, + "grad_norm": 13.423675537109375, + "learning_rate": 2.1112910671514083e-05, + "loss": 2.7293, + "step": 5575500 + }, + { + "epoch": 1.733380791989642, + "grad_norm": 11.43233871459961, + "learning_rate": 2.1110320133505967e-05, + "loss": 2.7763, + "step": 5576000 + }, + { + "epoch": 1.7335362242701289, + "grad_norm": 7.384858131408691, + "learning_rate": 2.110772959549785e-05, + "loss": 2.7368, + "step": 5576500 + }, + { + "epoch": 1.7336916565506157, + "grad_norm": 10.184048652648926, + "learning_rate": 2.110513905748974e-05, + "loss": 2.7502, + "step": 5577000 + }, + { + "epoch": 1.7338470888311026, + "grad_norm": 9.634334564208984, + "learning_rate": 2.1102548519481625e-05, + "loss": 2.7266, + "step": 5577500 + }, + { + "epoch": 1.7340025211115895, + "grad_norm": 15.250049591064453, + "learning_rate": 2.109995798147351e-05, + "loss": 2.7579, + "step": 5578000 + }, + { + "epoch": 1.7341579533920763, + "grad_norm": 11.505859375, + "learning_rate": 2.1097367443465396e-05, + "loss": 2.7618, + "step": 5578500 + }, + { + "epoch": 1.7343133856725632, + "grad_norm": 102.39402770996094, + "learning_rate": 2.109477690545728e-05, + "loss": 2.7476, + "step": 5579000 + }, + { + "epoch": 1.73446881795305, + "grad_norm": 15.279481887817383, + "learning_rate": 2.1092186367449167e-05, + "loss": 2.7233, + "step": 5579500 + }, + { + "epoch": 1.734624250233537, + "grad_norm": 8.467799186706543, + "learning_rate": 2.108959582944105e-05, + "loss": 2.7366, + "step": 5580000 + }, + { + "epoch": 1.734779682514024, + "grad_norm": 11.671953201293945, + "learning_rate": 2.1087005291432938e-05, + "loss": 2.754, + "step": 5580500 + }, + { + "epoch": 1.734935114794511, + "grad_norm": 12.527445793151855, + "learning_rate": 2.108441475342482e-05, + "loss": 2.7414, + "step": 5581000 + }, + { + "epoch": 1.7350905470749978, + "grad_norm": 8.117383003234863, + "learning_rate": 2.1081824215416705e-05, + "loss": 2.7243, + "step": 5581500 + }, + { + "epoch": 1.7352459793554846, + "grad_norm": 24.902292251586914, + "learning_rate": 2.1079233677408592e-05, + "loss": 2.7716, + "step": 5582000 + }, + { + "epoch": 1.7354014116359715, + "grad_norm": 8.752123832702637, + "learning_rate": 2.107664313940048e-05, + "loss": 2.7324, + "step": 5582500 + }, + { + "epoch": 1.7355568439164584, + "grad_norm": 15.148240089416504, + "learning_rate": 2.1074052601392363e-05, + "loss": 2.8022, + "step": 5583000 + }, + { + "epoch": 1.7357122761969452, + "grad_norm": 10.36146068572998, + "learning_rate": 2.1071462063384247e-05, + "loss": 2.7046, + "step": 5583500 + }, + { + "epoch": 1.735867708477432, + "grad_norm": 26.885385513305664, + "learning_rate": 2.1068871525376134e-05, + "loss": 2.6898, + "step": 5584000 + }, + { + "epoch": 1.736023140757919, + "grad_norm": 10.354341506958008, + "learning_rate": 2.106628098736802e-05, + "loss": 2.7576, + "step": 5584500 + }, + { + "epoch": 1.7361785730384058, + "grad_norm": 7.398693561553955, + "learning_rate": 2.1063690449359905e-05, + "loss": 2.7432, + "step": 5585000 + }, + { + "epoch": 1.7363340053188927, + "grad_norm": 11.687897682189941, + "learning_rate": 2.106109991135179e-05, + "loss": 2.734, + "step": 5585500 + }, + { + "epoch": 1.7364894375993796, + "grad_norm": 14.692216873168945, + "learning_rate": 2.1058509373343676e-05, + "loss": 2.7625, + "step": 5586000 + }, + { + "epoch": 1.7366448698798664, + "grad_norm": 14.024373054504395, + "learning_rate": 2.105591883533556e-05, + "loss": 2.6501, + "step": 5586500 + }, + { + "epoch": 1.7368003021603533, + "grad_norm": 9.378252029418945, + "learning_rate": 2.1053328297327447e-05, + "loss": 2.7635, + "step": 5587000 + }, + { + "epoch": 1.7369557344408402, + "grad_norm": 8.909526824951172, + "learning_rate": 2.1050737759319334e-05, + "loss": 2.7164, + "step": 5587500 + }, + { + "epoch": 1.737111166721327, + "grad_norm": 9.924139976501465, + "learning_rate": 2.1048147221311218e-05, + "loss": 2.7525, + "step": 5588000 + }, + { + "epoch": 1.737266599001814, + "grad_norm": 9.620555877685547, + "learning_rate": 2.10455566833031e-05, + "loss": 2.6755, + "step": 5588500 + }, + { + "epoch": 1.7374220312823008, + "grad_norm": 8.180747985839844, + "learning_rate": 2.104296614529499e-05, + "loss": 2.802, + "step": 5589000 + }, + { + "epoch": 1.7375774635627876, + "grad_norm": 8.831352233886719, + "learning_rate": 2.1040375607286876e-05, + "loss": 2.7287, + "step": 5589500 + }, + { + "epoch": 1.7377328958432745, + "grad_norm": 8.544135093688965, + "learning_rate": 2.103778506927876e-05, + "loss": 2.7301, + "step": 5590000 + }, + { + "epoch": 1.7378883281237614, + "grad_norm": 10.80766773223877, + "learning_rate": 2.1035194531270644e-05, + "loss": 2.7223, + "step": 5590500 + }, + { + "epoch": 1.7380437604042482, + "grad_norm": 10.104358673095703, + "learning_rate": 2.1032603993262527e-05, + "loss": 2.686, + "step": 5591000 + }, + { + "epoch": 1.738199192684735, + "grad_norm": 9.508543014526367, + "learning_rate": 2.1030013455254414e-05, + "loss": 2.727, + "step": 5591500 + }, + { + "epoch": 1.738354624965222, + "grad_norm": 10.287943840026855, + "learning_rate": 2.10274229172463e-05, + "loss": 2.7281, + "step": 5592000 + }, + { + "epoch": 1.7385100572457088, + "grad_norm": 9.550997734069824, + "learning_rate": 2.1024832379238185e-05, + "loss": 2.7308, + "step": 5592500 + }, + { + "epoch": 1.7386654895261957, + "grad_norm": 8.031083106994629, + "learning_rate": 2.1022241841230072e-05, + "loss": 2.7719, + "step": 5593000 + }, + { + "epoch": 1.7388209218066826, + "grad_norm": 13.268560409545898, + "learning_rate": 2.1019651303221956e-05, + "loss": 2.7254, + "step": 5593500 + }, + { + "epoch": 1.7389763540871694, + "grad_norm": 8.850129127502441, + "learning_rate": 2.1017060765213843e-05, + "loss": 2.7386, + "step": 5594000 + }, + { + "epoch": 1.7391317863676563, + "grad_norm": 11.035103797912598, + "learning_rate": 2.1014470227205727e-05, + "loss": 2.6744, + "step": 5594500 + }, + { + "epoch": 1.7392872186481432, + "grad_norm": 11.206184387207031, + "learning_rate": 2.1011879689197614e-05, + "loss": 2.7532, + "step": 5595000 + }, + { + "epoch": 1.73944265092863, + "grad_norm": 8.120749473571777, + "learning_rate": 2.1009289151189498e-05, + "loss": 2.7481, + "step": 5595500 + }, + { + "epoch": 1.739598083209117, + "grad_norm": 8.468664169311523, + "learning_rate": 2.1006698613181382e-05, + "loss": 2.7499, + "step": 5596000 + }, + { + "epoch": 1.7397535154896038, + "grad_norm": 9.79216194152832, + "learning_rate": 2.100410807517327e-05, + "loss": 2.734, + "step": 5596500 + }, + { + "epoch": 1.7399089477700906, + "grad_norm": 11.724173545837402, + "learning_rate": 2.1001517537165156e-05, + "loss": 2.776, + "step": 5597000 + }, + { + "epoch": 1.7400643800505775, + "grad_norm": 5.806468963623047, + "learning_rate": 2.099892699915704e-05, + "loss": 2.6929, + "step": 5597500 + }, + { + "epoch": 1.7402198123310644, + "grad_norm": 9.624393463134766, + "learning_rate": 2.0996336461148924e-05, + "loss": 2.7368, + "step": 5598000 + }, + { + "epoch": 1.7403752446115515, + "grad_norm": 29.02776336669922, + "learning_rate": 2.099374592314081e-05, + "loss": 2.7613, + "step": 5598500 + }, + { + "epoch": 1.7405306768920383, + "grad_norm": 10.071310043334961, + "learning_rate": 2.0991155385132698e-05, + "loss": 2.7215, + "step": 5599000 + }, + { + "epoch": 1.7406861091725252, + "grad_norm": 10.578564643859863, + "learning_rate": 2.0988564847124582e-05, + "loss": 2.7716, + "step": 5599500 + }, + { + "epoch": 1.740841541453012, + "grad_norm": 13.545334815979004, + "learning_rate": 2.0985974309116466e-05, + "loss": 2.7091, + "step": 5600000 + }, + { + "epoch": 1.740996973733499, + "grad_norm": 10.71084213256836, + "learning_rate": 2.0983383771108353e-05, + "loss": 2.7529, + "step": 5600500 + }, + { + "epoch": 1.7411524060139858, + "grad_norm": 9.950919151306152, + "learning_rate": 2.0980793233100236e-05, + "loss": 2.721, + "step": 5601000 + }, + { + "epoch": 1.7413078382944727, + "grad_norm": 10.557405471801758, + "learning_rate": 2.0978202695092124e-05, + "loss": 2.6817, + "step": 5601500 + }, + { + "epoch": 1.7414632705749595, + "grad_norm": 9.379722595214844, + "learning_rate": 2.097561215708401e-05, + "loss": 2.7597, + "step": 5602000 + }, + { + "epoch": 1.7416187028554464, + "grad_norm": 9.833681106567383, + "learning_rate": 2.0973021619075895e-05, + "loss": 2.7509, + "step": 5602500 + }, + { + "epoch": 1.7417741351359333, + "grad_norm": 9.894307136535645, + "learning_rate": 2.0970431081067778e-05, + "loss": 2.7466, + "step": 5603000 + }, + { + "epoch": 1.7419295674164201, + "grad_norm": 10.359843254089355, + "learning_rate": 2.0967840543059662e-05, + "loss": 2.7199, + "step": 5603500 + }, + { + "epoch": 1.742084999696907, + "grad_norm": 7.667438983917236, + "learning_rate": 2.0965250005051553e-05, + "loss": 2.7248, + "step": 5604000 + }, + { + "epoch": 1.742240431977394, + "grad_norm": 16.859233856201172, + "learning_rate": 2.0962659467043436e-05, + "loss": 2.7898, + "step": 5604500 + }, + { + "epoch": 1.742395864257881, + "grad_norm": 11.312005996704102, + "learning_rate": 2.096006892903532e-05, + "loss": 2.7685, + "step": 5605000 + }, + { + "epoch": 1.7425512965383678, + "grad_norm": 7.943682670593262, + "learning_rate": 2.0957478391027207e-05, + "loss": 2.7574, + "step": 5605500 + }, + { + "epoch": 1.7427067288188547, + "grad_norm": 10.02105712890625, + "learning_rate": 2.095488785301909e-05, + "loss": 2.7448, + "step": 5606000 + }, + { + "epoch": 1.7428621610993416, + "grad_norm": 8.261442184448242, + "learning_rate": 2.0952297315010978e-05, + "loss": 2.748, + "step": 5606500 + }, + { + "epoch": 1.7430175933798284, + "grad_norm": 8.683956146240234, + "learning_rate": 2.0949706777002862e-05, + "loss": 2.778, + "step": 5607000 + }, + { + "epoch": 1.7431730256603153, + "grad_norm": 7.802090644836426, + "learning_rate": 2.094711623899475e-05, + "loss": 2.7128, + "step": 5607500 + }, + { + "epoch": 1.7433284579408022, + "grad_norm": 11.075742721557617, + "learning_rate": 2.0944525700986633e-05, + "loss": 2.7468, + "step": 5608000 + }, + { + "epoch": 1.743483890221289, + "grad_norm": 9.0062837600708, + "learning_rate": 2.0941935162978517e-05, + "loss": 2.7086, + "step": 5608500 + }, + { + "epoch": 1.743639322501776, + "grad_norm": 10.941509246826172, + "learning_rate": 2.0939344624970404e-05, + "loss": 2.7688, + "step": 5609000 + }, + { + "epoch": 1.7437947547822628, + "grad_norm": 16.00079917907715, + "learning_rate": 2.093675408696229e-05, + "loss": 2.7343, + "step": 5609500 + }, + { + "epoch": 1.7439501870627496, + "grad_norm": 8.903157234191895, + "learning_rate": 2.0934163548954175e-05, + "loss": 2.7287, + "step": 5610000 + }, + { + "epoch": 1.7441056193432365, + "grad_norm": 18.88021469116211, + "learning_rate": 2.093157301094606e-05, + "loss": 2.7418, + "step": 5610500 + }, + { + "epoch": 1.7442610516237234, + "grad_norm": 8.177295684814453, + "learning_rate": 2.0928982472937946e-05, + "loss": 2.7075, + "step": 5611000 + }, + { + "epoch": 1.7444164839042102, + "grad_norm": 10.538715362548828, + "learning_rate": 2.0926391934929833e-05, + "loss": 2.7382, + "step": 5611500 + }, + { + "epoch": 1.744571916184697, + "grad_norm": 9.37331485748291, + "learning_rate": 2.0923801396921717e-05, + "loss": 2.7282, + "step": 5612000 + }, + { + "epoch": 1.744727348465184, + "grad_norm": 10.562308311462402, + "learning_rate": 2.09212108589136e-05, + "loss": 2.7364, + "step": 5612500 + }, + { + "epoch": 1.7448827807456708, + "grad_norm": 8.09030818939209, + "learning_rate": 2.0918620320905487e-05, + "loss": 2.755, + "step": 5613000 + }, + { + "epoch": 1.7450382130261577, + "grad_norm": 8.9757719039917, + "learning_rate": 2.091602978289737e-05, + "loss": 2.7698, + "step": 5613500 + }, + { + "epoch": 1.7451936453066446, + "grad_norm": 9.980052947998047, + "learning_rate": 2.091343924488926e-05, + "loss": 2.7719, + "step": 5614000 + }, + { + "epoch": 1.7453490775871314, + "grad_norm": 8.560591697692871, + "learning_rate": 2.0910848706881146e-05, + "loss": 2.7341, + "step": 5614500 + }, + { + "epoch": 1.7455045098676183, + "grad_norm": 10.04994010925293, + "learning_rate": 2.090825816887303e-05, + "loss": 2.7254, + "step": 5615000 + }, + { + "epoch": 1.7456599421481052, + "grad_norm": 9.152693748474121, + "learning_rate": 2.0905667630864913e-05, + "loss": 2.7299, + "step": 5615500 + }, + { + "epoch": 1.745815374428592, + "grad_norm": 11.398470878601074, + "learning_rate": 2.09030770928568e-05, + "loss": 2.6999, + "step": 5616000 + }, + { + "epoch": 1.745970806709079, + "grad_norm": 9.507048606872559, + "learning_rate": 2.0900486554848687e-05, + "loss": 2.7023, + "step": 5616500 + }, + { + "epoch": 1.7461262389895658, + "grad_norm": 9.508527755737305, + "learning_rate": 2.089789601684057e-05, + "loss": 2.7283, + "step": 5617000 + }, + { + "epoch": 1.7462816712700526, + "grad_norm": 8.554730415344238, + "learning_rate": 2.0895305478832455e-05, + "loss": 2.7144, + "step": 5617500 + }, + { + "epoch": 1.7464371035505395, + "grad_norm": 9.085792541503906, + "learning_rate": 2.089271494082434e-05, + "loss": 2.7253, + "step": 5618000 + }, + { + "epoch": 1.7465925358310264, + "grad_norm": 14.0467529296875, + "learning_rate": 2.0890124402816226e-05, + "loss": 2.7081, + "step": 5618500 + }, + { + "epoch": 1.7467479681115132, + "grad_norm": 14.392855644226074, + "learning_rate": 2.0887533864808113e-05, + "loss": 2.715, + "step": 5619000 + }, + { + "epoch": 1.746903400392, + "grad_norm": 10.356927871704102, + "learning_rate": 2.0884943326799997e-05, + "loss": 2.7946, + "step": 5619500 + }, + { + "epoch": 1.747058832672487, + "grad_norm": 9.411283493041992, + "learning_rate": 2.0882352788791884e-05, + "loss": 2.7456, + "step": 5620000 + }, + { + "epoch": 1.7472142649529738, + "grad_norm": 8.05736255645752, + "learning_rate": 2.0879762250783768e-05, + "loss": 2.7445, + "step": 5620500 + }, + { + "epoch": 1.7473696972334607, + "grad_norm": 10.05606460571289, + "learning_rate": 2.0877171712775655e-05, + "loss": 2.7346, + "step": 5621000 + }, + { + "epoch": 1.7475251295139476, + "grad_norm": 10.959057807922363, + "learning_rate": 2.087458117476754e-05, + "loss": 2.7153, + "step": 5621500 + }, + { + "epoch": 1.7476805617944344, + "grad_norm": 8.718178749084473, + "learning_rate": 2.0871990636759426e-05, + "loss": 2.7347, + "step": 5622000 + }, + { + "epoch": 1.7478359940749215, + "grad_norm": 9.90658950805664, + "learning_rate": 2.086940009875131e-05, + "loss": 2.7298, + "step": 5622500 + }, + { + "epoch": 1.7479914263554084, + "grad_norm": 10.467490196228027, + "learning_rate": 2.0866809560743193e-05, + "loss": 2.736, + "step": 5623000 + }, + { + "epoch": 1.7481468586358953, + "grad_norm": 11.15328598022461, + "learning_rate": 2.086421902273508e-05, + "loss": 2.6902, + "step": 5623500 + }, + { + "epoch": 1.7483022909163821, + "grad_norm": 17.30813217163086, + "learning_rate": 2.0861628484726968e-05, + "loss": 2.6965, + "step": 5624000 + }, + { + "epoch": 1.748457723196869, + "grad_norm": 8.98267936706543, + "learning_rate": 2.085903794671885e-05, + "loss": 2.6702, + "step": 5624500 + }, + { + "epoch": 1.7486131554773559, + "grad_norm": 16.18772315979004, + "learning_rate": 2.0856447408710735e-05, + "loss": 2.744, + "step": 5625000 + }, + { + "epoch": 1.7487685877578427, + "grad_norm": 10.430644035339355, + "learning_rate": 2.0853856870702622e-05, + "loss": 2.7247, + "step": 5625500 + }, + { + "epoch": 1.7489240200383296, + "grad_norm": 10.055732727050781, + "learning_rate": 2.085126633269451e-05, + "loss": 2.725, + "step": 5626000 + }, + { + "epoch": 1.7490794523188165, + "grad_norm": 9.562021255493164, + "learning_rate": 2.0848675794686393e-05, + "loss": 2.7321, + "step": 5626500 + }, + { + "epoch": 1.7492348845993033, + "grad_norm": 8.39257526397705, + "learning_rate": 2.0846085256678277e-05, + "loss": 2.6995, + "step": 5627000 + }, + { + "epoch": 1.7493903168797902, + "grad_norm": 10.704808235168457, + "learning_rate": 2.0843494718670164e-05, + "loss": 2.7354, + "step": 5627500 + }, + { + "epoch": 1.749545749160277, + "grad_norm": 25.266178131103516, + "learning_rate": 2.0840904180662048e-05, + "loss": 2.7414, + "step": 5628000 + }, + { + "epoch": 1.7497011814407641, + "grad_norm": 18.334131240844727, + "learning_rate": 2.0838313642653935e-05, + "loss": 2.7103, + "step": 5628500 + }, + { + "epoch": 1.749856613721251, + "grad_norm": 37.557125091552734, + "learning_rate": 2.0835723104645822e-05, + "loss": 2.721, + "step": 5629000 + }, + { + "epoch": 1.7500120460017379, + "grad_norm": 9.988791465759277, + "learning_rate": 2.0833132566637706e-05, + "loss": 2.7645, + "step": 5629500 + }, + { + "epoch": 1.7501674782822247, + "grad_norm": 9.219799041748047, + "learning_rate": 2.083054202862959e-05, + "loss": 2.7153, + "step": 5630000 + }, + { + "epoch": 1.7503229105627116, + "grad_norm": 10.678006172180176, + "learning_rate": 2.0827951490621473e-05, + "loss": 2.748, + "step": 5630500 + }, + { + "epoch": 1.7504783428431985, + "grad_norm": 11.18124771118164, + "learning_rate": 2.0825360952613364e-05, + "loss": 2.7162, + "step": 5631000 + }, + { + "epoch": 1.7506337751236853, + "grad_norm": 12.522735595703125, + "learning_rate": 2.0822770414605248e-05, + "loss": 2.7474, + "step": 5631500 + }, + { + "epoch": 1.7507892074041722, + "grad_norm": 15.868451118469238, + "learning_rate": 2.082017987659713e-05, + "loss": 2.7166, + "step": 5632000 + }, + { + "epoch": 1.750944639684659, + "grad_norm": 8.41861629486084, + "learning_rate": 2.081758933858902e-05, + "loss": 2.7619, + "step": 5632500 + }, + { + "epoch": 1.751100071965146, + "grad_norm": 11.118828773498535, + "learning_rate": 2.0814998800580902e-05, + "loss": 2.744, + "step": 5633000 + }, + { + "epoch": 1.7512555042456328, + "grad_norm": 12.942522048950195, + "learning_rate": 2.081240826257279e-05, + "loss": 2.7072, + "step": 5633500 + }, + { + "epoch": 1.7514109365261197, + "grad_norm": 7.926102638244629, + "learning_rate": 2.0809817724564673e-05, + "loss": 2.7505, + "step": 5634000 + }, + { + "epoch": 1.7515663688066065, + "grad_norm": 8.878110885620117, + "learning_rate": 2.080722718655656e-05, + "loss": 2.757, + "step": 5634500 + }, + { + "epoch": 1.7517218010870934, + "grad_norm": 8.487936973571777, + "learning_rate": 2.0804636648548444e-05, + "loss": 2.7061, + "step": 5635000 + }, + { + "epoch": 1.7518772333675803, + "grad_norm": 10.067526817321777, + "learning_rate": 2.0802046110540328e-05, + "loss": 2.7887, + "step": 5635500 + }, + { + "epoch": 1.7520326656480671, + "grad_norm": 11.278448104858398, + "learning_rate": 2.0799455572532215e-05, + "loss": 2.7217, + "step": 5636000 + }, + { + "epoch": 1.752188097928554, + "grad_norm": 10.852230072021484, + "learning_rate": 2.0796865034524102e-05, + "loss": 2.721, + "step": 5636500 + }, + { + "epoch": 1.7523435302090409, + "grad_norm": 10.169709205627441, + "learning_rate": 2.0794274496515986e-05, + "loss": 2.7197, + "step": 5637000 + }, + { + "epoch": 1.7524989624895277, + "grad_norm": 8.536920547485352, + "learning_rate": 2.079168395850787e-05, + "loss": 2.7295, + "step": 5637500 + }, + { + "epoch": 1.7526543947700146, + "grad_norm": 11.86166000366211, + "learning_rate": 2.0789093420499757e-05, + "loss": 2.6904, + "step": 5638000 + }, + { + "epoch": 1.7528098270505015, + "grad_norm": 11.02828311920166, + "learning_rate": 2.0786502882491644e-05, + "loss": 2.7321, + "step": 5638500 + }, + { + "epoch": 1.7529652593309883, + "grad_norm": 7.6820387840271, + "learning_rate": 2.0783912344483528e-05, + "loss": 2.7548, + "step": 5639000 + }, + { + "epoch": 1.7531206916114752, + "grad_norm": 9.8980073928833, + "learning_rate": 2.0781321806475412e-05, + "loss": 2.7562, + "step": 5639500 + }, + { + "epoch": 1.753276123891962, + "grad_norm": 23.097993850708008, + "learning_rate": 2.07787312684673e-05, + "loss": 2.6851, + "step": 5640000 + }, + { + "epoch": 1.753431556172449, + "grad_norm": 20.818340301513672, + "learning_rate": 2.0776140730459183e-05, + "loss": 2.7256, + "step": 5640500 + }, + { + "epoch": 1.7535869884529358, + "grad_norm": 8.692002296447754, + "learning_rate": 2.077355019245107e-05, + "loss": 2.752, + "step": 5641000 + }, + { + "epoch": 1.7537424207334227, + "grad_norm": 8.67007827758789, + "learning_rate": 2.0770959654442957e-05, + "loss": 2.7288, + "step": 5641500 + }, + { + "epoch": 1.7538978530139095, + "grad_norm": 6.239678382873535, + "learning_rate": 2.076836911643484e-05, + "loss": 2.6673, + "step": 5642000 + }, + { + "epoch": 1.7540532852943964, + "grad_norm": 21.556039810180664, + "learning_rate": 2.0765778578426724e-05, + "loss": 2.7248, + "step": 5642500 + }, + { + "epoch": 1.7542087175748833, + "grad_norm": 9.015057563781738, + "learning_rate": 2.076318804041861e-05, + "loss": 2.731, + "step": 5643000 + }, + { + "epoch": 1.7543641498553701, + "grad_norm": 6.672528266906738, + "learning_rate": 2.07605975024105e-05, + "loss": 2.7114, + "step": 5643500 + }, + { + "epoch": 1.754519582135857, + "grad_norm": 9.654899597167969, + "learning_rate": 2.0758006964402383e-05, + "loss": 2.7449, + "step": 5644000 + }, + { + "epoch": 1.7546750144163439, + "grad_norm": 6.800428867340088, + "learning_rate": 2.0755416426394266e-05, + "loss": 2.7099, + "step": 5644500 + }, + { + "epoch": 1.7548304466968307, + "grad_norm": 9.419367790222168, + "learning_rate": 2.075282588838615e-05, + "loss": 2.7405, + "step": 5645000 + }, + { + "epoch": 1.7549858789773176, + "grad_norm": 27.581222534179688, + "learning_rate": 2.0750235350378037e-05, + "loss": 2.7595, + "step": 5645500 + }, + { + "epoch": 1.7551413112578045, + "grad_norm": 10.056479454040527, + "learning_rate": 2.0747644812369924e-05, + "loss": 2.76, + "step": 5646000 + }, + { + "epoch": 1.7552967435382916, + "grad_norm": 9.350549697875977, + "learning_rate": 2.0745054274361808e-05, + "loss": 2.7569, + "step": 5646500 + }, + { + "epoch": 1.7554521758187784, + "grad_norm": 10.534601211547852, + "learning_rate": 2.0742463736353695e-05, + "loss": 2.7135, + "step": 5647000 + }, + { + "epoch": 1.7556076080992653, + "grad_norm": 8.100284576416016, + "learning_rate": 2.073987319834558e-05, + "loss": 2.7578, + "step": 5647500 + }, + { + "epoch": 1.7557630403797522, + "grad_norm": 14.490961074829102, + "learning_rate": 2.0737282660337466e-05, + "loss": 2.7005, + "step": 5648000 + }, + { + "epoch": 1.755918472660239, + "grad_norm": 10.452899932861328, + "learning_rate": 2.073469212232935e-05, + "loss": 2.7418, + "step": 5648500 + }, + { + "epoch": 1.756073904940726, + "grad_norm": 7.355588436126709, + "learning_rate": 2.0732101584321237e-05, + "loss": 2.7621, + "step": 5649000 + }, + { + "epoch": 1.7562293372212128, + "grad_norm": 7.842201232910156, + "learning_rate": 2.072951104631312e-05, + "loss": 2.7504, + "step": 5649500 + }, + { + "epoch": 1.7563847695016996, + "grad_norm": 10.24954605102539, + "learning_rate": 2.0726920508305005e-05, + "loss": 2.7285, + "step": 5650000 + }, + { + "epoch": 1.7565402017821865, + "grad_norm": 32.18710708618164, + "learning_rate": 2.0724329970296892e-05, + "loss": 2.7572, + "step": 5650500 + }, + { + "epoch": 1.7566956340626734, + "grad_norm": 11.532466888427734, + "learning_rate": 2.072173943228878e-05, + "loss": 2.736, + "step": 5651000 + }, + { + "epoch": 1.7568510663431602, + "grad_norm": 9.092362403869629, + "learning_rate": 2.0719148894280663e-05, + "loss": 2.7456, + "step": 5651500 + }, + { + "epoch": 1.757006498623647, + "grad_norm": 7.873612403869629, + "learning_rate": 2.0716558356272547e-05, + "loss": 2.7125, + "step": 5652000 + }, + { + "epoch": 1.7571619309041342, + "grad_norm": 10.21078872680664, + "learning_rate": 2.0713967818264434e-05, + "loss": 2.765, + "step": 5652500 + }, + { + "epoch": 1.757317363184621, + "grad_norm": 11.409716606140137, + "learning_rate": 2.071137728025632e-05, + "loss": 2.7623, + "step": 5653000 + }, + { + "epoch": 1.757472795465108, + "grad_norm": 9.951282501220703, + "learning_rate": 2.0708786742248205e-05, + "loss": 2.7513, + "step": 5653500 + }, + { + "epoch": 1.7576282277455948, + "grad_norm": 9.600851058959961, + "learning_rate": 2.070619620424009e-05, + "loss": 2.734, + "step": 5654000 + }, + { + "epoch": 1.7577836600260817, + "grad_norm": 9.068460464477539, + "learning_rate": 2.0703605666231976e-05, + "loss": 2.705, + "step": 5654500 + }, + { + "epoch": 1.7579390923065685, + "grad_norm": 8.51336669921875, + "learning_rate": 2.070101512822386e-05, + "loss": 2.7129, + "step": 5655000 + }, + { + "epoch": 1.7580945245870554, + "grad_norm": 10.507108688354492, + "learning_rate": 2.0698424590215746e-05, + "loss": 2.706, + "step": 5655500 + }, + { + "epoch": 1.7582499568675423, + "grad_norm": 9.396469116210938, + "learning_rate": 2.0695834052207634e-05, + "loss": 2.7489, + "step": 5656000 + }, + { + "epoch": 1.7584053891480291, + "grad_norm": 20.42148208618164, + "learning_rate": 2.0693243514199517e-05, + "loss": 2.7823, + "step": 5656500 + }, + { + "epoch": 1.758560821428516, + "grad_norm": 12.622444152832031, + "learning_rate": 2.06906529761914e-05, + "loss": 2.7141, + "step": 5657000 + }, + { + "epoch": 1.7587162537090029, + "grad_norm": 9.487565040588379, + "learning_rate": 2.0688062438183288e-05, + "loss": 2.7144, + "step": 5657500 + }, + { + "epoch": 1.7588716859894897, + "grad_norm": 10.089287757873535, + "learning_rate": 2.0685471900175175e-05, + "loss": 2.7429, + "step": 5658000 + }, + { + "epoch": 1.7590271182699766, + "grad_norm": 8.781876564025879, + "learning_rate": 2.068288136216706e-05, + "loss": 2.7567, + "step": 5658500 + }, + { + "epoch": 1.7591825505504635, + "grad_norm": 9.586721420288086, + "learning_rate": 2.0680290824158943e-05, + "loss": 2.7683, + "step": 5659000 + }, + { + "epoch": 1.7593379828309503, + "grad_norm": 8.340873718261719, + "learning_rate": 2.067770028615083e-05, + "loss": 2.7001, + "step": 5659500 + }, + { + "epoch": 1.7594934151114372, + "grad_norm": 8.45920467376709, + "learning_rate": 2.0675109748142714e-05, + "loss": 2.741, + "step": 5660000 + }, + { + "epoch": 1.759648847391924, + "grad_norm": 10.103623390197754, + "learning_rate": 2.06725192101346e-05, + "loss": 2.7398, + "step": 5660500 + }, + { + "epoch": 1.759804279672411, + "grad_norm": 13.078770637512207, + "learning_rate": 2.0669928672126485e-05, + "loss": 2.7459, + "step": 5661000 + }, + { + "epoch": 1.7599597119528978, + "grad_norm": 11.374587059020996, + "learning_rate": 2.0667338134118372e-05, + "loss": 2.7361, + "step": 5661500 + }, + { + "epoch": 1.7601151442333847, + "grad_norm": 8.92498779296875, + "learning_rate": 2.0664747596110256e-05, + "loss": 2.7852, + "step": 5662000 + }, + { + "epoch": 1.7602705765138715, + "grad_norm": 8.387311935424805, + "learning_rate": 2.0662157058102143e-05, + "loss": 2.7368, + "step": 5662500 + }, + { + "epoch": 1.7604260087943584, + "grad_norm": 5.60782527923584, + "learning_rate": 2.0659566520094027e-05, + "loss": 2.7442, + "step": 5663000 + }, + { + "epoch": 1.7605814410748453, + "grad_norm": 9.990998268127441, + "learning_rate": 2.0656975982085914e-05, + "loss": 2.735, + "step": 5663500 + }, + { + "epoch": 1.7607368733553321, + "grad_norm": 9.061568260192871, + "learning_rate": 2.0654385444077798e-05, + "loss": 2.7202, + "step": 5664000 + }, + { + "epoch": 1.760892305635819, + "grad_norm": 11.50590991973877, + "learning_rate": 2.065179490606968e-05, + "loss": 2.7253, + "step": 5664500 + }, + { + "epoch": 1.7610477379163059, + "grad_norm": 9.612143516540527, + "learning_rate": 2.064920436806157e-05, + "loss": 2.7328, + "step": 5665000 + }, + { + "epoch": 1.7612031701967927, + "grad_norm": 10.345931053161621, + "learning_rate": 2.0646613830053456e-05, + "loss": 2.7031, + "step": 5665500 + }, + { + "epoch": 1.7613586024772796, + "grad_norm": 7.714984893798828, + "learning_rate": 2.064402329204534e-05, + "loss": 2.6583, + "step": 5666000 + }, + { + "epoch": 1.7615140347577665, + "grad_norm": 13.707921981811523, + "learning_rate": 2.0641432754037223e-05, + "loss": 2.7493, + "step": 5666500 + }, + { + "epoch": 1.7616694670382533, + "grad_norm": 10.207139015197754, + "learning_rate": 2.063884221602911e-05, + "loss": 2.71, + "step": 5667000 + }, + { + "epoch": 1.7618248993187402, + "grad_norm": 12.721758842468262, + "learning_rate": 2.0636251678020997e-05, + "loss": 2.6692, + "step": 5667500 + }, + { + "epoch": 1.761980331599227, + "grad_norm": 8.65423583984375, + "learning_rate": 2.063366114001288e-05, + "loss": 2.7048, + "step": 5668000 + }, + { + "epoch": 1.762135763879714, + "grad_norm": 8.937150001525879, + "learning_rate": 2.063107060200477e-05, + "loss": 2.7658, + "step": 5668500 + }, + { + "epoch": 1.7622911961602008, + "grad_norm": 9.393484115600586, + "learning_rate": 2.0628480063996652e-05, + "loss": 2.684, + "step": 5669000 + }, + { + "epoch": 1.7624466284406877, + "grad_norm": 9.037761688232422, + "learning_rate": 2.0625889525988536e-05, + "loss": 2.772, + "step": 5669500 + }, + { + "epoch": 1.7626020607211745, + "grad_norm": 9.021722793579102, + "learning_rate": 2.0623298987980423e-05, + "loss": 2.7213, + "step": 5670000 + }, + { + "epoch": 1.7627574930016616, + "grad_norm": 28.353342056274414, + "learning_rate": 2.062070844997231e-05, + "loss": 2.7422, + "step": 5670500 + }, + { + "epoch": 1.7629129252821485, + "grad_norm": 8.475739479064941, + "learning_rate": 2.0618117911964194e-05, + "loss": 2.7551, + "step": 5671000 + }, + { + "epoch": 1.7630683575626354, + "grad_norm": 8.137874603271484, + "learning_rate": 2.0615527373956078e-05, + "loss": 2.7182, + "step": 5671500 + }, + { + "epoch": 1.7632237898431222, + "grad_norm": 9.049293518066406, + "learning_rate": 2.061293683594796e-05, + "loss": 2.7134, + "step": 5672000 + }, + { + "epoch": 1.763379222123609, + "grad_norm": 14.16977310180664, + "learning_rate": 2.0610346297939852e-05, + "loss": 2.7578, + "step": 5672500 + }, + { + "epoch": 1.763534654404096, + "grad_norm": 11.145598411560059, + "learning_rate": 2.0607755759931736e-05, + "loss": 2.7317, + "step": 5673000 + }, + { + "epoch": 1.7636900866845828, + "grad_norm": 8.936454772949219, + "learning_rate": 2.060516522192362e-05, + "loss": 2.7802, + "step": 5673500 + }, + { + "epoch": 1.7638455189650697, + "grad_norm": 9.673921585083008, + "learning_rate": 2.0602574683915507e-05, + "loss": 2.7853, + "step": 5674000 + }, + { + "epoch": 1.7640009512455566, + "grad_norm": 9.40227222442627, + "learning_rate": 2.059998414590739e-05, + "loss": 2.7454, + "step": 5674500 + }, + { + "epoch": 1.7641563835260434, + "grad_norm": 9.418869972229004, + "learning_rate": 2.0597393607899278e-05, + "loss": 2.7119, + "step": 5675000 + }, + { + "epoch": 1.7643118158065303, + "grad_norm": 12.798643112182617, + "learning_rate": 2.059480306989116e-05, + "loss": 2.7098, + "step": 5675500 + }, + { + "epoch": 1.7644672480870172, + "grad_norm": 17.5849666595459, + "learning_rate": 2.059221253188305e-05, + "loss": 2.7463, + "step": 5676000 + }, + { + "epoch": 1.7646226803675042, + "grad_norm": 8.855607986450195, + "learning_rate": 2.0589621993874932e-05, + "loss": 2.722, + "step": 5676500 + }, + { + "epoch": 1.7647781126479911, + "grad_norm": 7.05081844329834, + "learning_rate": 2.0587031455866816e-05, + "loss": 2.6777, + "step": 5677000 + }, + { + "epoch": 1.764933544928478, + "grad_norm": 8.356974601745605, + "learning_rate": 2.0584440917858707e-05, + "loss": 2.6764, + "step": 5677500 + }, + { + "epoch": 1.7650889772089648, + "grad_norm": 12.467208862304688, + "learning_rate": 2.058185037985059e-05, + "loss": 2.7405, + "step": 5678000 + }, + { + "epoch": 1.7652444094894517, + "grad_norm": 7.888331413269043, + "learning_rate": 2.0579259841842474e-05, + "loss": 2.7462, + "step": 5678500 + }, + { + "epoch": 1.7653998417699386, + "grad_norm": 35.93060302734375, + "learning_rate": 2.0576669303834358e-05, + "loss": 2.7341, + "step": 5679000 + }, + { + "epoch": 1.7655552740504254, + "grad_norm": 11.844101905822754, + "learning_rate": 2.0574078765826245e-05, + "loss": 2.6936, + "step": 5679500 + }, + { + "epoch": 1.7657107063309123, + "grad_norm": 15.293746948242188, + "learning_rate": 2.0571488227818132e-05, + "loss": 2.7289, + "step": 5680000 + }, + { + "epoch": 1.7658661386113992, + "grad_norm": 10.201661109924316, + "learning_rate": 2.0568897689810016e-05, + "loss": 2.7221, + "step": 5680500 + }, + { + "epoch": 1.766021570891886, + "grad_norm": 16.978919982910156, + "learning_rate": 2.05663071518019e-05, + "loss": 2.7477, + "step": 5681000 + }, + { + "epoch": 1.766177003172373, + "grad_norm": 10.143819808959961, + "learning_rate": 2.0563716613793787e-05, + "loss": 2.7711, + "step": 5681500 + }, + { + "epoch": 1.7663324354528598, + "grad_norm": 9.517789840698242, + "learning_rate": 2.056112607578567e-05, + "loss": 2.7407, + "step": 5682000 + }, + { + "epoch": 1.7664878677333467, + "grad_norm": 12.172747611999512, + "learning_rate": 2.0558535537777558e-05, + "loss": 2.7181, + "step": 5682500 + }, + { + "epoch": 1.7666433000138335, + "grad_norm": 10.209077835083008, + "learning_rate": 2.0555944999769445e-05, + "loss": 2.6986, + "step": 5683000 + }, + { + "epoch": 1.7667987322943204, + "grad_norm": 15.975022315979004, + "learning_rate": 2.055335446176133e-05, + "loss": 2.6904, + "step": 5683500 + }, + { + "epoch": 1.7669541645748073, + "grad_norm": 34.82484817504883, + "learning_rate": 2.0550763923753213e-05, + "loss": 2.7124, + "step": 5684000 + }, + { + "epoch": 1.7671095968552941, + "grad_norm": 7.921019554138184, + "learning_rate": 2.05481733857451e-05, + "loss": 2.7609, + "step": 5684500 + }, + { + "epoch": 1.767265029135781, + "grad_norm": 6.247814655303955, + "learning_rate": 2.0545582847736987e-05, + "loss": 2.7298, + "step": 5685000 + }, + { + "epoch": 1.7674204614162679, + "grad_norm": 8.3722562789917, + "learning_rate": 2.054299230972887e-05, + "loss": 2.7145, + "step": 5685500 + }, + { + "epoch": 1.7675758936967547, + "grad_norm": 8.80297565460205, + "learning_rate": 2.0540401771720754e-05, + "loss": 2.7181, + "step": 5686000 + }, + { + "epoch": 1.7677313259772416, + "grad_norm": 9.903562545776367, + "learning_rate": 2.053781123371264e-05, + "loss": 2.7575, + "step": 5686500 + }, + { + "epoch": 1.7678867582577285, + "grad_norm": 8.362092971801758, + "learning_rate": 2.0535220695704525e-05, + "loss": 2.7266, + "step": 5687000 + }, + { + "epoch": 1.7680421905382153, + "grad_norm": 8.044750213623047, + "learning_rate": 2.0532630157696412e-05, + "loss": 2.7449, + "step": 5687500 + }, + { + "epoch": 1.7681976228187022, + "grad_norm": 8.775464057922363, + "learning_rate": 2.0530039619688296e-05, + "loss": 2.7044, + "step": 5688000 + }, + { + "epoch": 1.768353055099189, + "grad_norm": 9.84617805480957, + "learning_rate": 2.0527449081680183e-05, + "loss": 2.7853, + "step": 5688500 + }, + { + "epoch": 1.768508487379676, + "grad_norm": 10.292781829833984, + "learning_rate": 2.0524858543672067e-05, + "loss": 2.7579, + "step": 5689000 + }, + { + "epoch": 1.7686639196601628, + "grad_norm": 9.08936882019043, + "learning_rate": 2.0522268005663954e-05, + "loss": 2.728, + "step": 5689500 + }, + { + "epoch": 1.7688193519406497, + "grad_norm": 10.131477355957031, + "learning_rate": 2.0519677467655838e-05, + "loss": 2.7142, + "step": 5690000 + }, + { + "epoch": 1.7689747842211365, + "grad_norm": 9.30162239074707, + "learning_rate": 2.0517086929647725e-05, + "loss": 2.6939, + "step": 5690500 + }, + { + "epoch": 1.7691302165016234, + "grad_norm": 13.3618803024292, + "learning_rate": 2.051449639163961e-05, + "loss": 2.7736, + "step": 5691000 + }, + { + "epoch": 1.7692856487821103, + "grad_norm": 9.316285133361816, + "learning_rate": 2.0511905853631493e-05, + "loss": 2.6878, + "step": 5691500 + }, + { + "epoch": 1.7694410810625971, + "grad_norm": 17.562213897705078, + "learning_rate": 2.050931531562338e-05, + "loss": 2.744, + "step": 5692000 + }, + { + "epoch": 1.769596513343084, + "grad_norm": 7.608456611633301, + "learning_rate": 2.0506724777615267e-05, + "loss": 2.7123, + "step": 5692500 + }, + { + "epoch": 1.7697519456235709, + "grad_norm": 11.023627281188965, + "learning_rate": 2.050413423960715e-05, + "loss": 2.6883, + "step": 5693000 + }, + { + "epoch": 1.7699073779040577, + "grad_norm": 9.176066398620605, + "learning_rate": 2.0501543701599035e-05, + "loss": 2.7224, + "step": 5693500 + }, + { + "epoch": 1.7700628101845446, + "grad_norm": 32.545528411865234, + "learning_rate": 2.0498953163590922e-05, + "loss": 2.7734, + "step": 5694000 + }, + { + "epoch": 1.7702182424650317, + "grad_norm": 7.4569091796875, + "learning_rate": 2.049636262558281e-05, + "loss": 2.6942, + "step": 5694500 + }, + { + "epoch": 1.7703736747455185, + "grad_norm": 8.930628776550293, + "learning_rate": 2.0493772087574693e-05, + "loss": 2.7381, + "step": 5695000 + }, + { + "epoch": 1.7705291070260054, + "grad_norm": 10.259322166442871, + "learning_rate": 2.049118154956658e-05, + "loss": 2.736, + "step": 5695500 + }, + { + "epoch": 1.7706845393064923, + "grad_norm": 12.701292037963867, + "learning_rate": 2.0488591011558464e-05, + "loss": 2.7395, + "step": 5696000 + }, + { + "epoch": 1.7708399715869791, + "grad_norm": 9.654927253723145, + "learning_rate": 2.0486000473550347e-05, + "loss": 2.7165, + "step": 5696500 + }, + { + "epoch": 1.770995403867466, + "grad_norm": 9.034053802490234, + "learning_rate": 2.0483409935542234e-05, + "loss": 2.7265, + "step": 5697000 + }, + { + "epoch": 1.7711508361479529, + "grad_norm": 7.241479396820068, + "learning_rate": 2.048081939753412e-05, + "loss": 2.7446, + "step": 5697500 + }, + { + "epoch": 1.7713062684284397, + "grad_norm": 10.579947471618652, + "learning_rate": 2.0478228859526005e-05, + "loss": 2.7405, + "step": 5698000 + }, + { + "epoch": 1.7714617007089266, + "grad_norm": 8.31330394744873, + "learning_rate": 2.047563832151789e-05, + "loss": 2.7296, + "step": 5698500 + }, + { + "epoch": 1.7716171329894135, + "grad_norm": 7.5644755363464355, + "learning_rate": 2.0473047783509773e-05, + "loss": 2.7184, + "step": 5699000 + }, + { + "epoch": 1.7717725652699003, + "grad_norm": 10.398176193237305, + "learning_rate": 2.0470457245501663e-05, + "loss": 2.6877, + "step": 5699500 + }, + { + "epoch": 1.7719279975503872, + "grad_norm": 12.817828178405762, + "learning_rate": 2.0467866707493547e-05, + "loss": 2.7511, + "step": 5700000 + }, + { + "epoch": 1.7720834298308743, + "grad_norm": 11.131415367126465, + "learning_rate": 2.046527616948543e-05, + "loss": 2.7424, + "step": 5700500 + }, + { + "epoch": 1.7722388621113612, + "grad_norm": 12.903995513916016, + "learning_rate": 2.0462685631477318e-05, + "loss": 2.6979, + "step": 5701000 + }, + { + "epoch": 1.772394294391848, + "grad_norm": 10.721875190734863, + "learning_rate": 2.0460095093469202e-05, + "loss": 2.7503, + "step": 5701500 + }, + { + "epoch": 1.772549726672335, + "grad_norm": 13.049309730529785, + "learning_rate": 2.045750455546109e-05, + "loss": 2.6923, + "step": 5702000 + }, + { + "epoch": 1.7727051589528218, + "grad_norm": 8.421113967895508, + "learning_rate": 2.0454914017452973e-05, + "loss": 2.7203, + "step": 5702500 + }, + { + "epoch": 1.7728605912333086, + "grad_norm": 9.730867385864258, + "learning_rate": 2.045232347944486e-05, + "loss": 2.7709, + "step": 5703000 + }, + { + "epoch": 1.7730160235137955, + "grad_norm": 12.421927452087402, + "learning_rate": 2.0449732941436744e-05, + "loss": 2.7779, + "step": 5703500 + }, + { + "epoch": 1.7731714557942824, + "grad_norm": 10.412355422973633, + "learning_rate": 2.0447142403428628e-05, + "loss": 2.7474, + "step": 5704000 + }, + { + "epoch": 1.7733268880747692, + "grad_norm": 8.759136199951172, + "learning_rate": 2.0444551865420518e-05, + "loss": 2.7539, + "step": 5704500 + }, + { + "epoch": 1.773482320355256, + "grad_norm": 9.064653396606445, + "learning_rate": 2.0441961327412402e-05, + "loss": 2.7231, + "step": 5705000 + }, + { + "epoch": 1.773637752635743, + "grad_norm": 8.907818794250488, + "learning_rate": 2.0439370789404286e-05, + "loss": 2.7402, + "step": 5705500 + }, + { + "epoch": 1.7737931849162298, + "grad_norm": 5.63023042678833, + "learning_rate": 2.043678025139617e-05, + "loss": 2.7041, + "step": 5706000 + }, + { + "epoch": 1.7739486171967167, + "grad_norm": 10.04074478149414, + "learning_rate": 2.0434189713388057e-05, + "loss": 2.7486, + "step": 5706500 + }, + { + "epoch": 1.7741040494772036, + "grad_norm": 11.753114700317383, + "learning_rate": 2.0431599175379944e-05, + "loss": 2.763, + "step": 5707000 + }, + { + "epoch": 1.7742594817576904, + "grad_norm": 22.84893798828125, + "learning_rate": 2.0429008637371827e-05, + "loss": 2.7028, + "step": 5707500 + }, + { + "epoch": 1.7744149140381773, + "grad_norm": 12.10370922088623, + "learning_rate": 2.042641809936371e-05, + "loss": 2.7637, + "step": 5708000 + }, + { + "epoch": 1.7745703463186642, + "grad_norm": 9.083860397338867, + "learning_rate": 2.04238275613556e-05, + "loss": 2.7398, + "step": 5708500 + }, + { + "epoch": 1.774725778599151, + "grad_norm": 13.488946914672852, + "learning_rate": 2.0421237023347482e-05, + "loss": 2.7651, + "step": 5709000 + }, + { + "epoch": 1.774881210879638, + "grad_norm": 13.314909934997559, + "learning_rate": 2.041864648533937e-05, + "loss": 2.754, + "step": 5709500 + }, + { + "epoch": 1.7750366431601248, + "grad_norm": 8.217445373535156, + "learning_rate": 2.0416055947331256e-05, + "loss": 2.6974, + "step": 5710000 + }, + { + "epoch": 1.7751920754406116, + "grad_norm": 18.217208862304688, + "learning_rate": 2.041346540932314e-05, + "loss": 2.726, + "step": 5710500 + }, + { + "epoch": 1.7753475077210985, + "grad_norm": 8.194424629211426, + "learning_rate": 2.0410874871315024e-05, + "loss": 2.7261, + "step": 5711000 + }, + { + "epoch": 1.7755029400015854, + "grad_norm": 12.961714744567871, + "learning_rate": 2.040828433330691e-05, + "loss": 2.7073, + "step": 5711500 + }, + { + "epoch": 1.7756583722820722, + "grad_norm": 10.228477478027344, + "learning_rate": 2.0405693795298798e-05, + "loss": 2.7446, + "step": 5712000 + }, + { + "epoch": 1.775813804562559, + "grad_norm": 7.450662612915039, + "learning_rate": 2.0403103257290682e-05, + "loss": 2.7301, + "step": 5712500 + }, + { + "epoch": 1.775969236843046, + "grad_norm": 9.232902526855469, + "learning_rate": 2.0400512719282566e-05, + "loss": 2.6873, + "step": 5713000 + }, + { + "epoch": 1.7761246691235328, + "grad_norm": 9.744635581970215, + "learning_rate": 2.0397922181274453e-05, + "loss": 2.7641, + "step": 5713500 + }, + { + "epoch": 1.7762801014040197, + "grad_norm": 11.971370697021484, + "learning_rate": 2.0395331643266337e-05, + "loss": 2.7219, + "step": 5714000 + }, + { + "epoch": 1.7764355336845066, + "grad_norm": 8.056486129760742, + "learning_rate": 2.0392741105258224e-05, + "loss": 2.7272, + "step": 5714500 + }, + { + "epoch": 1.7765909659649934, + "grad_norm": 5.142429828643799, + "learning_rate": 2.0390150567250108e-05, + "loss": 2.7276, + "step": 5715000 + }, + { + "epoch": 1.7767463982454803, + "grad_norm": 9.235345840454102, + "learning_rate": 2.0387560029241995e-05, + "loss": 2.7272, + "step": 5715500 + }, + { + "epoch": 1.7769018305259672, + "grad_norm": 11.247243881225586, + "learning_rate": 2.038496949123388e-05, + "loss": 2.7327, + "step": 5716000 + }, + { + "epoch": 1.777057262806454, + "grad_norm": 9.437397003173828, + "learning_rate": 2.0382378953225766e-05, + "loss": 2.745, + "step": 5716500 + }, + { + "epoch": 1.777212695086941, + "grad_norm": 8.716401100158691, + "learning_rate": 2.0379788415217653e-05, + "loss": 2.7048, + "step": 5717000 + }, + { + "epoch": 1.7773681273674278, + "grad_norm": 9.449695587158203, + "learning_rate": 2.0377197877209537e-05, + "loss": 2.694, + "step": 5717500 + }, + { + "epoch": 1.7775235596479146, + "grad_norm": 8.681059837341309, + "learning_rate": 2.037460733920142e-05, + "loss": 2.7276, + "step": 5718000 + }, + { + "epoch": 1.7776789919284015, + "grad_norm": 10.261377334594727, + "learning_rate": 2.0372016801193304e-05, + "loss": 2.6962, + "step": 5718500 + }, + { + "epoch": 1.7778344242088886, + "grad_norm": 9.1791410446167, + "learning_rate": 2.036942626318519e-05, + "loss": 2.7416, + "step": 5719000 + }, + { + "epoch": 1.7779898564893755, + "grad_norm": 19.637605667114258, + "learning_rate": 2.036683572517708e-05, + "loss": 2.7116, + "step": 5719500 + }, + { + "epoch": 1.7781452887698623, + "grad_norm": 8.935803413391113, + "learning_rate": 2.0364245187168962e-05, + "loss": 2.6988, + "step": 5720000 + }, + { + "epoch": 1.7783007210503492, + "grad_norm": 13.043717384338379, + "learning_rate": 2.0361654649160846e-05, + "loss": 2.7392, + "step": 5720500 + }, + { + "epoch": 1.778456153330836, + "grad_norm": 9.273871421813965, + "learning_rate": 2.0359064111152733e-05, + "loss": 2.7415, + "step": 5721000 + }, + { + "epoch": 1.778611585611323, + "grad_norm": 15.193942070007324, + "learning_rate": 2.035647357314462e-05, + "loss": 2.6726, + "step": 5721500 + }, + { + "epoch": 1.7787670178918098, + "grad_norm": 7.2175493240356445, + "learning_rate": 2.0353883035136504e-05, + "loss": 2.7063, + "step": 5722000 + }, + { + "epoch": 1.7789224501722967, + "grad_norm": 13.403300285339355, + "learning_rate": 2.035129249712839e-05, + "loss": 2.7216, + "step": 5722500 + }, + { + "epoch": 1.7790778824527835, + "grad_norm": 10.819050788879395, + "learning_rate": 2.0348701959120275e-05, + "loss": 2.7038, + "step": 5723000 + }, + { + "epoch": 1.7792333147332704, + "grad_norm": 10.314406394958496, + "learning_rate": 2.034611142111216e-05, + "loss": 2.7482, + "step": 5723500 + }, + { + "epoch": 1.7793887470137573, + "grad_norm": 9.465991020202637, + "learning_rate": 2.0343520883104046e-05, + "loss": 2.7226, + "step": 5724000 + }, + { + "epoch": 1.7795441792942441, + "grad_norm": 11.292116165161133, + "learning_rate": 2.0340930345095933e-05, + "loss": 2.7815, + "step": 5724500 + }, + { + "epoch": 1.7796996115747312, + "grad_norm": 11.226669311523438, + "learning_rate": 2.0338339807087817e-05, + "loss": 2.7111, + "step": 5725000 + }, + { + "epoch": 1.779855043855218, + "grad_norm": 7.865079402923584, + "learning_rate": 2.03357492690797e-05, + "loss": 2.7408, + "step": 5725500 + }, + { + "epoch": 1.780010476135705, + "grad_norm": 8.242121696472168, + "learning_rate": 2.0333158731071588e-05, + "loss": 2.6871, + "step": 5726000 + }, + { + "epoch": 1.7801659084161918, + "grad_norm": 9.055038452148438, + "learning_rate": 2.0330568193063475e-05, + "loss": 2.6946, + "step": 5726500 + }, + { + "epoch": 1.7803213406966787, + "grad_norm": 10.410615921020508, + "learning_rate": 2.032797765505536e-05, + "loss": 2.7738, + "step": 5727000 + }, + { + "epoch": 1.7804767729771656, + "grad_norm": 9.461305618286133, + "learning_rate": 2.0325387117047242e-05, + "loss": 2.7543, + "step": 5727500 + }, + { + "epoch": 1.7806322052576524, + "grad_norm": 17.156570434570312, + "learning_rate": 2.032279657903913e-05, + "loss": 2.7292, + "step": 5728000 + }, + { + "epoch": 1.7807876375381393, + "grad_norm": 12.171445846557617, + "learning_rate": 2.0320206041031013e-05, + "loss": 2.7603, + "step": 5728500 + }, + { + "epoch": 1.7809430698186262, + "grad_norm": 16.95802116394043, + "learning_rate": 2.03176155030229e-05, + "loss": 2.7241, + "step": 5729000 + }, + { + "epoch": 1.781098502099113, + "grad_norm": 9.310752868652344, + "learning_rate": 2.0315024965014784e-05, + "loss": 2.7384, + "step": 5729500 + }, + { + "epoch": 1.7812539343796, + "grad_norm": 9.63293170928955, + "learning_rate": 2.031243442700667e-05, + "loss": 2.7547, + "step": 5730000 + }, + { + "epoch": 1.7814093666600868, + "grad_norm": 12.106781959533691, + "learning_rate": 2.0309843888998555e-05, + "loss": 2.7491, + "step": 5730500 + }, + { + "epoch": 1.7815647989405736, + "grad_norm": 9.019998550415039, + "learning_rate": 2.030725335099044e-05, + "loss": 2.7553, + "step": 5731000 + }, + { + "epoch": 1.7817202312210605, + "grad_norm": 9.251832962036133, + "learning_rate": 2.030466281298233e-05, + "loss": 2.7676, + "step": 5731500 + }, + { + "epoch": 1.7818756635015474, + "grad_norm": 9.696686744689941, + "learning_rate": 2.0302072274974213e-05, + "loss": 2.7676, + "step": 5732000 + }, + { + "epoch": 1.7820310957820342, + "grad_norm": 10.455595970153809, + "learning_rate": 2.0299481736966097e-05, + "loss": 2.6767, + "step": 5732500 + }, + { + "epoch": 1.782186528062521, + "grad_norm": 11.286972045898438, + "learning_rate": 2.029689119895798e-05, + "loss": 2.7479, + "step": 5733000 + }, + { + "epoch": 1.782341960343008, + "grad_norm": 11.703606605529785, + "learning_rate": 2.0294300660949868e-05, + "loss": 2.6981, + "step": 5733500 + }, + { + "epoch": 1.7824973926234948, + "grad_norm": 21.42058753967285, + "learning_rate": 2.0291710122941755e-05, + "loss": 2.7066, + "step": 5734000 + }, + { + "epoch": 1.7826528249039817, + "grad_norm": 15.553450584411621, + "learning_rate": 2.028911958493364e-05, + "loss": 2.7117, + "step": 5734500 + }, + { + "epoch": 1.7828082571844686, + "grad_norm": 8.737751007080078, + "learning_rate": 2.0286529046925526e-05, + "loss": 2.7046, + "step": 5735000 + }, + { + "epoch": 1.7829636894649554, + "grad_norm": 9.342211723327637, + "learning_rate": 2.028393850891741e-05, + "loss": 2.7181, + "step": 5735500 + }, + { + "epoch": 1.7831191217454423, + "grad_norm": 13.337345123291016, + "learning_rate": 2.0281347970909294e-05, + "loss": 2.6752, + "step": 5736000 + }, + { + "epoch": 1.7832745540259292, + "grad_norm": 6.825374126434326, + "learning_rate": 2.027875743290118e-05, + "loss": 2.7525, + "step": 5736500 + }, + { + "epoch": 1.783429986306416, + "grad_norm": 9.36738109588623, + "learning_rate": 2.0276166894893068e-05, + "loss": 2.708, + "step": 5737000 + }, + { + "epoch": 1.783585418586903, + "grad_norm": 8.543184280395508, + "learning_rate": 2.027357635688495e-05, + "loss": 2.6951, + "step": 5737500 + }, + { + "epoch": 1.7837408508673898, + "grad_norm": 8.867376327514648, + "learning_rate": 2.0270985818876835e-05, + "loss": 2.7542, + "step": 5738000 + }, + { + "epoch": 1.7838962831478766, + "grad_norm": 11.441325187683105, + "learning_rate": 2.0268395280868723e-05, + "loss": 2.711, + "step": 5738500 + }, + { + "epoch": 1.7840517154283635, + "grad_norm": 11.605770111083984, + "learning_rate": 2.026580474286061e-05, + "loss": 2.7348, + "step": 5739000 + }, + { + "epoch": 1.7842071477088504, + "grad_norm": 9.935285568237305, + "learning_rate": 2.0263214204852493e-05, + "loss": 2.6991, + "step": 5739500 + }, + { + "epoch": 1.7843625799893372, + "grad_norm": 7.626270771026611, + "learning_rate": 2.0260623666844377e-05, + "loss": 2.7084, + "step": 5740000 + }, + { + "epoch": 1.784518012269824, + "grad_norm": 9.540749549865723, + "learning_rate": 2.0258033128836264e-05, + "loss": 2.7526, + "step": 5740500 + }, + { + "epoch": 1.784673444550311, + "grad_norm": 11.00355339050293, + "learning_rate": 2.0255442590828148e-05, + "loss": 2.7214, + "step": 5741000 + }, + { + "epoch": 1.7848288768307978, + "grad_norm": 8.241467475891113, + "learning_rate": 2.0252852052820035e-05, + "loss": 2.7119, + "step": 5741500 + }, + { + "epoch": 1.7849843091112847, + "grad_norm": 14.110745429992676, + "learning_rate": 2.025026151481192e-05, + "loss": 2.7703, + "step": 5742000 + }, + { + "epoch": 1.7851397413917716, + "grad_norm": 8.66064453125, + "learning_rate": 2.0247670976803806e-05, + "loss": 2.7101, + "step": 5742500 + }, + { + "epoch": 1.7852951736722587, + "grad_norm": 12.64907169342041, + "learning_rate": 2.024508043879569e-05, + "loss": 2.7425, + "step": 5743000 + }, + { + "epoch": 1.7854506059527455, + "grad_norm": 9.870736122131348, + "learning_rate": 2.0242489900787577e-05, + "loss": 2.7459, + "step": 5743500 + }, + { + "epoch": 1.7856060382332324, + "grad_norm": 8.78262710571289, + "learning_rate": 2.0239899362779464e-05, + "loss": 2.7349, + "step": 5744000 + }, + { + "epoch": 1.7857614705137193, + "grad_norm": 16.65957260131836, + "learning_rate": 2.0237308824771348e-05, + "loss": 2.7245, + "step": 5744500 + }, + { + "epoch": 1.7859169027942061, + "grad_norm": 7.6246232986450195, + "learning_rate": 2.0234718286763232e-05, + "loss": 2.7591, + "step": 5745000 + }, + { + "epoch": 1.786072335074693, + "grad_norm": 8.142603874206543, + "learning_rate": 2.0232127748755116e-05, + "loss": 2.7087, + "step": 5745500 + }, + { + "epoch": 1.7862277673551799, + "grad_norm": 9.9512939453125, + "learning_rate": 2.0229537210747003e-05, + "loss": 2.7702, + "step": 5746000 + }, + { + "epoch": 1.7863831996356667, + "grad_norm": 7.38637638092041, + "learning_rate": 2.022694667273889e-05, + "loss": 2.7447, + "step": 5746500 + }, + { + "epoch": 1.7865386319161536, + "grad_norm": 14.935343742370605, + "learning_rate": 2.0224356134730774e-05, + "loss": 2.7606, + "step": 5747000 + }, + { + "epoch": 1.7866940641966405, + "grad_norm": 11.480167388916016, + "learning_rate": 2.0221765596722657e-05, + "loss": 2.7427, + "step": 5747500 + }, + { + "epoch": 1.7868494964771273, + "grad_norm": 7.870300769805908, + "learning_rate": 2.0219175058714545e-05, + "loss": 2.7256, + "step": 5748000 + }, + { + "epoch": 1.7870049287576142, + "grad_norm": 9.279096603393555, + "learning_rate": 2.0216584520706432e-05, + "loss": 2.7474, + "step": 5748500 + }, + { + "epoch": 1.7871603610381013, + "grad_norm": 20.030885696411133, + "learning_rate": 2.0213993982698315e-05, + "loss": 2.6978, + "step": 5749000 + }, + { + "epoch": 1.7873157933185881, + "grad_norm": 7.311225891113281, + "learning_rate": 2.0211403444690203e-05, + "loss": 2.7548, + "step": 5749500 + }, + { + "epoch": 1.787471225599075, + "grad_norm": 9.806154251098633, + "learning_rate": 2.0208812906682086e-05, + "loss": 2.714, + "step": 5750000 + }, + { + "epoch": 1.7876266578795619, + "grad_norm": 12.044170379638672, + "learning_rate": 2.020622236867397e-05, + "loss": 2.7228, + "step": 5750500 + }, + { + "epoch": 1.7877820901600487, + "grad_norm": 9.780577659606934, + "learning_rate": 2.0203631830665857e-05, + "loss": 2.7545, + "step": 5751000 + }, + { + "epoch": 1.7879375224405356, + "grad_norm": 14.945383071899414, + "learning_rate": 2.0201041292657744e-05, + "loss": 2.7704, + "step": 5751500 + }, + { + "epoch": 1.7880929547210225, + "grad_norm": 9.382939338684082, + "learning_rate": 2.0198450754649628e-05, + "loss": 2.7463, + "step": 5752000 + }, + { + "epoch": 1.7882483870015093, + "grad_norm": 8.615049362182617, + "learning_rate": 2.0195860216641512e-05, + "loss": 2.726, + "step": 5752500 + }, + { + "epoch": 1.7884038192819962, + "grad_norm": 9.157047271728516, + "learning_rate": 2.01932696786334e-05, + "loss": 2.7801, + "step": 5753000 + }, + { + "epoch": 1.788559251562483, + "grad_norm": 7.271677017211914, + "learning_rate": 2.0190679140625286e-05, + "loss": 2.7315, + "step": 5753500 + }, + { + "epoch": 1.78871468384297, + "grad_norm": 10.939412117004395, + "learning_rate": 2.018808860261717e-05, + "loss": 2.7497, + "step": 5754000 + }, + { + "epoch": 1.7888701161234568, + "grad_norm": 10.701611518859863, + "learning_rate": 2.0185498064609054e-05, + "loss": 2.7687, + "step": 5754500 + }, + { + "epoch": 1.7890255484039437, + "grad_norm": 11.652884483337402, + "learning_rate": 2.018290752660094e-05, + "loss": 2.761, + "step": 5755000 + }, + { + "epoch": 1.7891809806844305, + "grad_norm": 11.448012351989746, + "learning_rate": 2.0180316988592825e-05, + "loss": 2.7678, + "step": 5755500 + }, + { + "epoch": 1.7893364129649174, + "grad_norm": 13.542858123779297, + "learning_rate": 2.0177726450584712e-05, + "loss": 2.7203, + "step": 5756000 + }, + { + "epoch": 1.7894918452454043, + "grad_norm": 9.783933639526367, + "learning_rate": 2.0175135912576596e-05, + "loss": 2.7335, + "step": 5756500 + }, + { + "epoch": 1.7896472775258911, + "grad_norm": 7.235541820526123, + "learning_rate": 2.0172545374568483e-05, + "loss": 2.7577, + "step": 5757000 + }, + { + "epoch": 1.789802709806378, + "grad_norm": 9.354460716247559, + "learning_rate": 2.0169954836560367e-05, + "loss": 2.7414, + "step": 5757500 + }, + { + "epoch": 1.7899581420868649, + "grad_norm": 17.231210708618164, + "learning_rate": 2.0167364298552254e-05, + "loss": 2.7009, + "step": 5758000 + }, + { + "epoch": 1.7901135743673517, + "grad_norm": 8.570719718933105, + "learning_rate": 2.016477376054414e-05, + "loss": 2.7443, + "step": 5758500 + }, + { + "epoch": 1.7902690066478386, + "grad_norm": 12.85877513885498, + "learning_rate": 2.0162183222536025e-05, + "loss": 2.7622, + "step": 5759000 + }, + { + "epoch": 1.7904244389283255, + "grad_norm": 7.785243511199951, + "learning_rate": 2.015959268452791e-05, + "loss": 2.7692, + "step": 5759500 + }, + { + "epoch": 1.7905798712088123, + "grad_norm": 14.206023216247559, + "learning_rate": 2.0157002146519792e-05, + "loss": 2.7458, + "step": 5760000 + }, + { + "epoch": 1.7907353034892992, + "grad_norm": 9.163925170898438, + "learning_rate": 2.015441160851168e-05, + "loss": 2.7109, + "step": 5760500 + }, + { + "epoch": 1.790890735769786, + "grad_norm": 10.295646667480469, + "learning_rate": 2.0151821070503566e-05, + "loss": 2.7121, + "step": 5761000 + }, + { + "epoch": 1.791046168050273, + "grad_norm": 8.431660652160645, + "learning_rate": 2.014923053249545e-05, + "loss": 2.8029, + "step": 5761500 + }, + { + "epoch": 1.7912016003307598, + "grad_norm": 13.065950393676758, + "learning_rate": 2.0146639994487337e-05, + "loss": 2.6863, + "step": 5762000 + }, + { + "epoch": 1.7913570326112467, + "grad_norm": 11.097114562988281, + "learning_rate": 2.014404945647922e-05, + "loss": 2.7051, + "step": 5762500 + }, + { + "epoch": 1.7915124648917335, + "grad_norm": 8.609711647033691, + "learning_rate": 2.014145891847111e-05, + "loss": 2.7687, + "step": 5763000 + }, + { + "epoch": 1.7916678971722204, + "grad_norm": 8.003961563110352, + "learning_rate": 2.0138868380462992e-05, + "loss": 2.7141, + "step": 5763500 + }, + { + "epoch": 1.7918233294527073, + "grad_norm": 8.84726619720459, + "learning_rate": 2.013627784245488e-05, + "loss": 2.7236, + "step": 5764000 + }, + { + "epoch": 1.7919787617331941, + "grad_norm": 8.869169235229492, + "learning_rate": 2.0133687304446763e-05, + "loss": 2.6572, + "step": 5764500 + }, + { + "epoch": 1.792134194013681, + "grad_norm": 10.524043083190918, + "learning_rate": 2.0131096766438647e-05, + "loss": 2.7575, + "step": 5765000 + }, + { + "epoch": 1.7922896262941679, + "grad_norm": 7.57095193862915, + "learning_rate": 2.0128506228430534e-05, + "loss": 2.7152, + "step": 5765500 + }, + { + "epoch": 1.7924450585746547, + "grad_norm": 7.824375152587891, + "learning_rate": 2.012591569042242e-05, + "loss": 2.7135, + "step": 5766000 + }, + { + "epoch": 1.7926004908551416, + "grad_norm": 9.04682445526123, + "learning_rate": 2.0123325152414305e-05, + "loss": 2.7119, + "step": 5766500 + }, + { + "epoch": 1.7927559231356287, + "grad_norm": 10.94018268585205, + "learning_rate": 2.012073461440619e-05, + "loss": 2.7206, + "step": 5767000 + }, + { + "epoch": 1.7929113554161156, + "grad_norm": 9.95057487487793, + "learning_rate": 2.0118144076398076e-05, + "loss": 2.7202, + "step": 5767500 + }, + { + "epoch": 1.7930667876966024, + "grad_norm": 10.212307929992676, + "learning_rate": 2.0115553538389963e-05, + "loss": 2.737, + "step": 5768000 + }, + { + "epoch": 1.7932222199770893, + "grad_norm": 10.241365432739258, + "learning_rate": 2.0112963000381847e-05, + "loss": 2.6886, + "step": 5768500 + }, + { + "epoch": 1.7933776522575762, + "grad_norm": 34.06475067138672, + "learning_rate": 2.011037246237373e-05, + "loss": 2.6915, + "step": 5769000 + }, + { + "epoch": 1.793533084538063, + "grad_norm": 7.641523838043213, + "learning_rate": 2.0107781924365618e-05, + "loss": 2.6491, + "step": 5769500 + }, + { + "epoch": 1.79368851681855, + "grad_norm": 7.287962913513184, + "learning_rate": 2.01051913863575e-05, + "loss": 2.7564, + "step": 5770000 + }, + { + "epoch": 1.7938439490990368, + "grad_norm": 34.78261947631836, + "learning_rate": 2.010260084834939e-05, + "loss": 2.7302, + "step": 5770500 + }, + { + "epoch": 1.7939993813795236, + "grad_norm": 8.79026985168457, + "learning_rate": 2.0100010310341276e-05, + "loss": 2.7354, + "step": 5771000 + }, + { + "epoch": 1.7941548136600105, + "grad_norm": 9.65121078491211, + "learning_rate": 2.009741977233316e-05, + "loss": 2.7099, + "step": 5771500 + }, + { + "epoch": 1.7943102459404974, + "grad_norm": 8.652076721191406, + "learning_rate": 2.0094829234325043e-05, + "loss": 2.7546, + "step": 5772000 + }, + { + "epoch": 1.7944656782209842, + "grad_norm": 8.999741554260254, + "learning_rate": 2.0092238696316927e-05, + "loss": 2.7834, + "step": 5772500 + }, + { + "epoch": 1.7946211105014713, + "grad_norm": 20.520265579223633, + "learning_rate": 2.0089648158308818e-05, + "loss": 2.7023, + "step": 5773000 + }, + { + "epoch": 1.7947765427819582, + "grad_norm": 12.132678031921387, + "learning_rate": 2.00870576203007e-05, + "loss": 2.7499, + "step": 5773500 + }, + { + "epoch": 1.794931975062445, + "grad_norm": 6.615333557128906, + "learning_rate": 2.0084467082292585e-05, + "loss": 2.7357, + "step": 5774000 + }, + { + "epoch": 1.795087407342932, + "grad_norm": 11.619823455810547, + "learning_rate": 2.008187654428447e-05, + "loss": 2.7549, + "step": 5774500 + }, + { + "epoch": 1.7952428396234188, + "grad_norm": 16.76815414428711, + "learning_rate": 2.0079286006276356e-05, + "loss": 2.7972, + "step": 5775000 + }, + { + "epoch": 1.7953982719039057, + "grad_norm": 12.841267585754395, + "learning_rate": 2.0076695468268243e-05, + "loss": 2.7717, + "step": 5775500 + }, + { + "epoch": 1.7955537041843925, + "grad_norm": 10.023804664611816, + "learning_rate": 2.0074104930260127e-05, + "loss": 2.7347, + "step": 5776000 + }, + { + "epoch": 1.7957091364648794, + "grad_norm": 8.720477104187012, + "learning_rate": 2.0071514392252014e-05, + "loss": 2.7846, + "step": 5776500 + }, + { + "epoch": 1.7958645687453663, + "grad_norm": 8.86463737487793, + "learning_rate": 2.0068923854243898e-05, + "loss": 2.7635, + "step": 5777000 + }, + { + "epoch": 1.7960200010258531, + "grad_norm": 13.769062042236328, + "learning_rate": 2.006633331623578e-05, + "loss": 2.7395, + "step": 5777500 + }, + { + "epoch": 1.79617543330634, + "grad_norm": 9.721936225891113, + "learning_rate": 2.006374277822767e-05, + "loss": 2.7514, + "step": 5778000 + }, + { + "epoch": 1.7963308655868269, + "grad_norm": 9.495865821838379, + "learning_rate": 2.0061152240219556e-05, + "loss": 2.7398, + "step": 5778500 + }, + { + "epoch": 1.7964862978673137, + "grad_norm": 10.116159439086914, + "learning_rate": 2.005856170221144e-05, + "loss": 2.748, + "step": 5779000 + }, + { + "epoch": 1.7966417301478006, + "grad_norm": 8.798450469970703, + "learning_rate": 2.0055971164203323e-05, + "loss": 2.7066, + "step": 5779500 + }, + { + "epoch": 1.7967971624282875, + "grad_norm": 7.338807582855225, + "learning_rate": 2.005338062619521e-05, + "loss": 2.7216, + "step": 5780000 + }, + { + "epoch": 1.7969525947087743, + "grad_norm": 9.247177124023438, + "learning_rate": 2.0050790088187098e-05, + "loss": 2.7419, + "step": 5780500 + }, + { + "epoch": 1.7971080269892612, + "grad_norm": 8.495260238647461, + "learning_rate": 2.004819955017898e-05, + "loss": 2.7197, + "step": 5781000 + }, + { + "epoch": 1.797263459269748, + "grad_norm": 8.918952941894531, + "learning_rate": 2.0045609012170865e-05, + "loss": 2.7474, + "step": 5781500 + }, + { + "epoch": 1.797418891550235, + "grad_norm": 12.670991897583008, + "learning_rate": 2.0043018474162752e-05, + "loss": 2.7446, + "step": 5782000 + }, + { + "epoch": 1.7975743238307218, + "grad_norm": 9.382326126098633, + "learning_rate": 2.0040427936154636e-05, + "loss": 2.7156, + "step": 5782500 + }, + { + "epoch": 1.7977297561112087, + "grad_norm": 11.192821502685547, + "learning_rate": 2.0037837398146523e-05, + "loss": 2.7625, + "step": 5783000 + }, + { + "epoch": 1.7978851883916955, + "grad_norm": 26.657649993896484, + "learning_rate": 2.0035246860138407e-05, + "loss": 2.7151, + "step": 5783500 + }, + { + "epoch": 1.7980406206721824, + "grad_norm": 8.436371803283691, + "learning_rate": 2.0032656322130294e-05, + "loss": 2.7117, + "step": 5784000 + }, + { + "epoch": 1.7981960529526693, + "grad_norm": 11.320575714111328, + "learning_rate": 2.0030065784122178e-05, + "loss": 2.7539, + "step": 5784500 + }, + { + "epoch": 1.7983514852331561, + "grad_norm": 67.925537109375, + "learning_rate": 2.0027475246114065e-05, + "loss": 2.7025, + "step": 5785000 + }, + { + "epoch": 1.798506917513643, + "grad_norm": 22.198911666870117, + "learning_rate": 2.0024884708105952e-05, + "loss": 2.7401, + "step": 5785500 + }, + { + "epoch": 1.7986623497941299, + "grad_norm": 7.3809685707092285, + "learning_rate": 2.0022294170097836e-05, + "loss": 2.7228, + "step": 5786000 + }, + { + "epoch": 1.7988177820746167, + "grad_norm": 8.116350173950195, + "learning_rate": 2.001970363208972e-05, + "loss": 2.7512, + "step": 5786500 + }, + { + "epoch": 1.7989732143551036, + "grad_norm": 11.132631301879883, + "learning_rate": 2.0017113094081604e-05, + "loss": 2.7023, + "step": 5787000 + }, + { + "epoch": 1.7991286466355905, + "grad_norm": 10.092881202697754, + "learning_rate": 2.001452255607349e-05, + "loss": 2.716, + "step": 5787500 + }, + { + "epoch": 1.7992840789160773, + "grad_norm": 8.85791301727295, + "learning_rate": 2.0011932018065378e-05, + "loss": 2.7404, + "step": 5788000 + }, + { + "epoch": 1.7994395111965642, + "grad_norm": 12.453420639038086, + "learning_rate": 2.000934148005726e-05, + "loss": 2.7284, + "step": 5788500 + }, + { + "epoch": 1.799594943477051, + "grad_norm": 11.003256797790527, + "learning_rate": 2.000675094204915e-05, + "loss": 2.7138, + "step": 5789000 + }, + { + "epoch": 1.799750375757538, + "grad_norm": 8.504377365112305, + "learning_rate": 2.0004160404041033e-05, + "loss": 2.7333, + "step": 5789500 + }, + { + "epoch": 1.7999058080380248, + "grad_norm": 7.805610656738281, + "learning_rate": 2.000156986603292e-05, + "loss": 2.7283, + "step": 5790000 + }, + { + "epoch": 1.8000612403185117, + "grad_norm": 9.88372802734375, + "learning_rate": 1.9998979328024804e-05, + "loss": 2.7247, + "step": 5790500 + }, + { + "epoch": 1.8002166725989988, + "grad_norm": 9.314269065856934, + "learning_rate": 1.999638879001669e-05, + "loss": 2.7089, + "step": 5791000 + }, + { + "epoch": 1.8003721048794856, + "grad_norm": 11.077000617980957, + "learning_rate": 1.9993798252008574e-05, + "loss": 2.7555, + "step": 5791500 + }, + { + "epoch": 1.8005275371599725, + "grad_norm": 9.38348388671875, + "learning_rate": 1.9991207714000458e-05, + "loss": 2.7204, + "step": 5792000 + }, + { + "epoch": 1.8006829694404594, + "grad_norm": 13.04850959777832, + "learning_rate": 1.9988617175992345e-05, + "loss": 2.7149, + "step": 5792500 + }, + { + "epoch": 1.8008384017209462, + "grad_norm": 35.98662567138672, + "learning_rate": 1.9986026637984232e-05, + "loss": 2.7503, + "step": 5793000 + }, + { + "epoch": 1.800993834001433, + "grad_norm": 13.482525825500488, + "learning_rate": 1.9983436099976116e-05, + "loss": 2.7543, + "step": 5793500 + }, + { + "epoch": 1.80114926628192, + "grad_norm": 7.8856425285339355, + "learning_rate": 1.9980845561968e-05, + "loss": 2.7533, + "step": 5794000 + }, + { + "epoch": 1.8013046985624068, + "grad_norm": 22.00206756591797, + "learning_rate": 1.9978255023959887e-05, + "loss": 2.7538, + "step": 5794500 + }, + { + "epoch": 1.8014601308428937, + "grad_norm": 9.206548690795898, + "learning_rate": 1.9975664485951774e-05, + "loss": 2.699, + "step": 5795000 + }, + { + "epoch": 1.8016155631233806, + "grad_norm": 10.81515121459961, + "learning_rate": 1.9973073947943658e-05, + "loss": 2.7478, + "step": 5795500 + }, + { + "epoch": 1.8017709954038674, + "grad_norm": 6.810995578765869, + "learning_rate": 1.9970483409935542e-05, + "loss": 2.7511, + "step": 5796000 + }, + { + "epoch": 1.8019264276843543, + "grad_norm": 8.7907075881958, + "learning_rate": 1.996789287192743e-05, + "loss": 2.749, + "step": 5796500 + }, + { + "epoch": 1.8020818599648414, + "grad_norm": 13.816975593566895, + "learning_rate": 1.9965302333919313e-05, + "loss": 2.6956, + "step": 5797000 + }, + { + "epoch": 1.8022372922453282, + "grad_norm": 10.573467254638672, + "learning_rate": 1.99627117959112e-05, + "loss": 2.7289, + "step": 5797500 + }, + { + "epoch": 1.8023927245258151, + "grad_norm": 10.547255516052246, + "learning_rate": 1.9960121257903087e-05, + "loss": 2.7426, + "step": 5798000 + }, + { + "epoch": 1.802548156806302, + "grad_norm": 10.815384864807129, + "learning_rate": 1.995753071989497e-05, + "loss": 2.7546, + "step": 5798500 + }, + { + "epoch": 1.8027035890867888, + "grad_norm": 10.322466850280762, + "learning_rate": 1.9954940181886855e-05, + "loss": 2.7132, + "step": 5799000 + }, + { + "epoch": 1.8028590213672757, + "grad_norm": 9.273287773132324, + "learning_rate": 1.995234964387874e-05, + "loss": 2.7284, + "step": 5799500 + }, + { + "epoch": 1.8030144536477626, + "grad_norm": 19.23230743408203, + "learning_rate": 1.994975910587063e-05, + "loss": 2.7328, + "step": 5800000 + }, + { + "epoch": 1.8031698859282494, + "grad_norm": 14.638924598693848, + "learning_rate": 1.9947168567862513e-05, + "loss": 2.7876, + "step": 5800500 + }, + { + "epoch": 1.8033253182087363, + "grad_norm": 8.371914863586426, + "learning_rate": 1.9944578029854396e-05, + "loss": 2.7104, + "step": 5801000 + }, + { + "epoch": 1.8034807504892232, + "grad_norm": 6.960058212280273, + "learning_rate": 1.994198749184628e-05, + "loss": 2.7366, + "step": 5801500 + }, + { + "epoch": 1.80363618276971, + "grad_norm": 8.04061508178711, + "learning_rate": 1.9939396953838167e-05, + "loss": 2.7654, + "step": 5802000 + }, + { + "epoch": 1.803791615050197, + "grad_norm": 12.686299324035645, + "learning_rate": 1.9936806415830055e-05, + "loss": 2.67, + "step": 5802500 + }, + { + "epoch": 1.8039470473306838, + "grad_norm": 7.300060272216797, + "learning_rate": 1.9934215877821938e-05, + "loss": 2.7086, + "step": 5803000 + }, + { + "epoch": 1.8041024796111707, + "grad_norm": 11.267498970031738, + "learning_rate": 1.9931625339813825e-05, + "loss": 2.7035, + "step": 5803500 + }, + { + "epoch": 1.8042579118916575, + "grad_norm": 9.625798225402832, + "learning_rate": 1.992903480180571e-05, + "loss": 2.7797, + "step": 5804000 + }, + { + "epoch": 1.8044133441721444, + "grad_norm": 21.851442337036133, + "learning_rate": 1.9926444263797593e-05, + "loss": 2.7308, + "step": 5804500 + }, + { + "epoch": 1.8045687764526313, + "grad_norm": 12.79707145690918, + "learning_rate": 1.992385372578948e-05, + "loss": 2.7389, + "step": 5805000 + }, + { + "epoch": 1.8047242087331181, + "grad_norm": 9.793416976928711, + "learning_rate": 1.9921263187781367e-05, + "loss": 2.7434, + "step": 5805500 + }, + { + "epoch": 1.804879641013605, + "grad_norm": 39.678955078125, + "learning_rate": 1.991867264977325e-05, + "loss": 2.7547, + "step": 5806000 + }, + { + "epoch": 1.8050350732940919, + "grad_norm": 10.576752662658691, + "learning_rate": 1.9916082111765135e-05, + "loss": 2.7762, + "step": 5806500 + }, + { + "epoch": 1.8051905055745787, + "grad_norm": 9.069825172424316, + "learning_rate": 1.9913491573757022e-05, + "loss": 2.71, + "step": 5807000 + }, + { + "epoch": 1.8053459378550656, + "grad_norm": 9.461970329284668, + "learning_rate": 1.991090103574891e-05, + "loss": 2.7077, + "step": 5807500 + }, + { + "epoch": 1.8055013701355525, + "grad_norm": 11.786646842956543, + "learning_rate": 1.9908310497740793e-05, + "loss": 2.7337, + "step": 5808000 + }, + { + "epoch": 1.8056568024160393, + "grad_norm": 7.056237697601318, + "learning_rate": 1.9905719959732677e-05, + "loss": 2.7499, + "step": 5808500 + }, + { + "epoch": 1.8058122346965262, + "grad_norm": 12.4243745803833, + "learning_rate": 1.9903129421724564e-05, + "loss": 2.6994, + "step": 5809000 + }, + { + "epoch": 1.805967666977013, + "grad_norm": 7.42768669128418, + "learning_rate": 1.9900538883716448e-05, + "loss": 2.7291, + "step": 5809500 + }, + { + "epoch": 1.8061230992575, + "grad_norm": 10.238073348999023, + "learning_rate": 1.9897948345708335e-05, + "loss": 2.7548, + "step": 5810000 + }, + { + "epoch": 1.8062785315379868, + "grad_norm": 20.20711898803711, + "learning_rate": 1.989535780770022e-05, + "loss": 2.7566, + "step": 5810500 + }, + { + "epoch": 1.8064339638184737, + "grad_norm": 13.933663368225098, + "learning_rate": 1.9892767269692106e-05, + "loss": 2.7675, + "step": 5811000 + }, + { + "epoch": 1.8065893960989605, + "grad_norm": 10.265323638916016, + "learning_rate": 1.989017673168399e-05, + "loss": 2.7121, + "step": 5811500 + }, + { + "epoch": 1.8067448283794474, + "grad_norm": 9.245112419128418, + "learning_rate": 1.9887586193675877e-05, + "loss": 2.7349, + "step": 5812000 + }, + { + "epoch": 1.8069002606599343, + "grad_norm": 8.792658805847168, + "learning_rate": 1.9884995655667764e-05, + "loss": 2.7414, + "step": 5812500 + }, + { + "epoch": 1.8070556929404211, + "grad_norm": 9.545757293701172, + "learning_rate": 1.9882405117659647e-05, + "loss": 2.741, + "step": 5813000 + }, + { + "epoch": 1.807211125220908, + "grad_norm": 10.597321510314941, + "learning_rate": 1.987981457965153e-05, + "loss": 2.6853, + "step": 5813500 + }, + { + "epoch": 1.8073665575013949, + "grad_norm": 8.998087882995605, + "learning_rate": 1.9877224041643415e-05, + "loss": 2.7416, + "step": 5814000 + }, + { + "epoch": 1.8075219897818817, + "grad_norm": 7.1624345779418945, + "learning_rate": 1.9874633503635302e-05, + "loss": 2.7202, + "step": 5814500 + }, + { + "epoch": 1.8076774220623688, + "grad_norm": 11.500067710876465, + "learning_rate": 1.987204296562719e-05, + "loss": 2.7597, + "step": 5815000 + }, + { + "epoch": 1.8078328543428557, + "grad_norm": 8.843886375427246, + "learning_rate": 1.9869452427619073e-05, + "loss": 2.7468, + "step": 5815500 + }, + { + "epoch": 1.8079882866233425, + "grad_norm": 10.389167785644531, + "learning_rate": 1.986686188961096e-05, + "loss": 2.7656, + "step": 5816000 + }, + { + "epoch": 1.8081437189038294, + "grad_norm": 8.350046157836914, + "learning_rate": 1.9864271351602844e-05, + "loss": 2.7021, + "step": 5816500 + }, + { + "epoch": 1.8082991511843163, + "grad_norm": 8.903441429138184, + "learning_rate": 1.986168081359473e-05, + "loss": 2.7247, + "step": 5817000 + }, + { + "epoch": 1.8084545834648031, + "grad_norm": 11.524687767028809, + "learning_rate": 1.9859090275586615e-05, + "loss": 2.7051, + "step": 5817500 + }, + { + "epoch": 1.80861001574529, + "grad_norm": 23.744089126586914, + "learning_rate": 1.9856499737578502e-05, + "loss": 2.7349, + "step": 5818000 + }, + { + "epoch": 1.8087654480257769, + "grad_norm": 11.34437084197998, + "learning_rate": 1.9853909199570386e-05, + "loss": 2.7317, + "step": 5818500 + }, + { + "epoch": 1.8089208803062637, + "grad_norm": 10.650154113769531, + "learning_rate": 1.985131866156227e-05, + "loss": 2.7605, + "step": 5819000 + }, + { + "epoch": 1.8090763125867506, + "grad_norm": 15.107643127441406, + "learning_rate": 1.9848728123554157e-05, + "loss": 2.7182, + "step": 5819500 + }, + { + "epoch": 1.8092317448672375, + "grad_norm": 9.913867950439453, + "learning_rate": 1.9846137585546044e-05, + "loss": 2.7664, + "step": 5820000 + }, + { + "epoch": 1.8093871771477243, + "grad_norm": 9.38019847869873, + "learning_rate": 1.9843547047537928e-05, + "loss": 2.7636, + "step": 5820500 + }, + { + "epoch": 1.8095426094282114, + "grad_norm": 9.565056800842285, + "learning_rate": 1.984095650952981e-05, + "loss": 2.6942, + "step": 5821000 + }, + { + "epoch": 1.8096980417086983, + "grad_norm": 9.292051315307617, + "learning_rate": 1.98383659715217e-05, + "loss": 2.7567, + "step": 5821500 + }, + { + "epoch": 1.8098534739891852, + "grad_norm": 17.178701400756836, + "learning_rate": 1.9835775433513586e-05, + "loss": 2.7152, + "step": 5822000 + }, + { + "epoch": 1.810008906269672, + "grad_norm": 10.60623550415039, + "learning_rate": 1.983318489550547e-05, + "loss": 2.7479, + "step": 5822500 + }, + { + "epoch": 1.810164338550159, + "grad_norm": 7.428682327270508, + "learning_rate": 1.9830594357497353e-05, + "loss": 2.7464, + "step": 5823000 + }, + { + "epoch": 1.8103197708306458, + "grad_norm": 7.832756519317627, + "learning_rate": 1.982800381948924e-05, + "loss": 2.7042, + "step": 5823500 + }, + { + "epoch": 1.8104752031111326, + "grad_norm": 15.026590347290039, + "learning_rate": 1.9825413281481124e-05, + "loss": 2.7437, + "step": 5824000 + }, + { + "epoch": 1.8106306353916195, + "grad_norm": 6.487035274505615, + "learning_rate": 1.982282274347301e-05, + "loss": 2.7365, + "step": 5824500 + }, + { + "epoch": 1.8107860676721064, + "grad_norm": 9.786476135253906, + "learning_rate": 1.98202322054649e-05, + "loss": 2.7407, + "step": 5825000 + }, + { + "epoch": 1.8109414999525932, + "grad_norm": 10.403411865234375, + "learning_rate": 1.9817641667456782e-05, + "loss": 2.6611, + "step": 5825500 + }, + { + "epoch": 1.81109693223308, + "grad_norm": 7.865967273712158, + "learning_rate": 1.9815051129448666e-05, + "loss": 2.7172, + "step": 5826000 + }, + { + "epoch": 1.811252364513567, + "grad_norm": 10.785491943359375, + "learning_rate": 1.981246059144055e-05, + "loss": 2.7178, + "step": 5826500 + }, + { + "epoch": 1.8114077967940538, + "grad_norm": 11.011552810668945, + "learning_rate": 1.980987005343244e-05, + "loss": 2.6913, + "step": 5827000 + }, + { + "epoch": 1.8115632290745407, + "grad_norm": 7.9087042808532715, + "learning_rate": 1.9807279515424324e-05, + "loss": 2.6999, + "step": 5827500 + }, + { + "epoch": 1.8117186613550276, + "grad_norm": 12.539971351623535, + "learning_rate": 1.9804688977416208e-05, + "loss": 2.7672, + "step": 5828000 + }, + { + "epoch": 1.8118740936355144, + "grad_norm": 8.245692253112793, + "learning_rate": 1.980209843940809e-05, + "loss": 2.7676, + "step": 5828500 + }, + { + "epoch": 1.8120295259160013, + "grad_norm": 10.172515869140625, + "learning_rate": 1.979950790139998e-05, + "loss": 2.7254, + "step": 5829000 + }, + { + "epoch": 1.8121849581964882, + "grad_norm": 9.621429443359375, + "learning_rate": 1.9796917363391866e-05, + "loss": 2.7087, + "step": 5829500 + }, + { + "epoch": 1.812340390476975, + "grad_norm": 16.492969512939453, + "learning_rate": 1.979432682538375e-05, + "loss": 2.7083, + "step": 5830000 + }, + { + "epoch": 1.812495822757462, + "grad_norm": 9.958494186401367, + "learning_rate": 1.9791736287375637e-05, + "loss": 2.73, + "step": 5830500 + }, + { + "epoch": 1.8126512550379488, + "grad_norm": 9.955041885375977, + "learning_rate": 1.978914574936752e-05, + "loss": 2.7092, + "step": 5831000 + }, + { + "epoch": 1.8128066873184356, + "grad_norm": 14.078951835632324, + "learning_rate": 1.9786555211359404e-05, + "loss": 2.6801, + "step": 5831500 + }, + { + "epoch": 1.8129621195989225, + "grad_norm": 11.522712707519531, + "learning_rate": 1.978396467335129e-05, + "loss": 2.6972, + "step": 5832000 + }, + { + "epoch": 1.8131175518794094, + "grad_norm": 10.571388244628906, + "learning_rate": 1.978137413534318e-05, + "loss": 2.7734, + "step": 5832500 + }, + { + "epoch": 1.8132729841598962, + "grad_norm": 16.419937133789062, + "learning_rate": 1.9778783597335062e-05, + "loss": 2.7335, + "step": 5833000 + }, + { + "epoch": 1.813428416440383, + "grad_norm": 9.626447677612305, + "learning_rate": 1.9776193059326946e-05, + "loss": 2.7314, + "step": 5833500 + }, + { + "epoch": 1.81358384872087, + "grad_norm": 10.291988372802734, + "learning_rate": 1.9773602521318833e-05, + "loss": 2.7185, + "step": 5834000 + }, + { + "epoch": 1.8137392810013568, + "grad_norm": 8.285040855407715, + "learning_rate": 1.977101198331072e-05, + "loss": 2.7325, + "step": 5834500 + }, + { + "epoch": 1.8138947132818437, + "grad_norm": 9.932440757751465, + "learning_rate": 1.9768421445302604e-05, + "loss": 2.6692, + "step": 5835000 + }, + { + "epoch": 1.8140501455623306, + "grad_norm": 11.207313537597656, + "learning_rate": 1.9765830907294488e-05, + "loss": 2.7056, + "step": 5835500 + }, + { + "epoch": 1.8142055778428174, + "grad_norm": 8.094535827636719, + "learning_rate": 1.9763240369286375e-05, + "loss": 2.7, + "step": 5836000 + }, + { + "epoch": 1.8143610101233043, + "grad_norm": 10.066251754760742, + "learning_rate": 1.976064983127826e-05, + "loss": 2.7147, + "step": 5836500 + }, + { + "epoch": 1.8145164424037912, + "grad_norm": 19.379730224609375, + "learning_rate": 1.9758059293270146e-05, + "loss": 2.6913, + "step": 5837000 + }, + { + "epoch": 1.814671874684278, + "grad_norm": 11.610956192016602, + "learning_rate": 1.975546875526203e-05, + "loss": 2.7317, + "step": 5837500 + }, + { + "epoch": 1.814827306964765, + "grad_norm": 9.121953010559082, + "learning_rate": 1.9752878217253917e-05, + "loss": 2.725, + "step": 5838000 + }, + { + "epoch": 1.8149827392452518, + "grad_norm": 9.064462661743164, + "learning_rate": 1.97502876792458e-05, + "loss": 2.7169, + "step": 5838500 + }, + { + "epoch": 1.8151381715257389, + "grad_norm": 9.538369178771973, + "learning_rate": 1.9747697141237688e-05, + "loss": 2.722, + "step": 5839000 + }, + { + "epoch": 1.8152936038062257, + "grad_norm": 8.68166732788086, + "learning_rate": 1.9745106603229575e-05, + "loss": 2.7212, + "step": 5839500 + }, + { + "epoch": 1.8154490360867126, + "grad_norm": 11.63391399383545, + "learning_rate": 1.974251606522146e-05, + "loss": 2.74, + "step": 5840000 + }, + { + "epoch": 1.8156044683671995, + "grad_norm": 8.321126937866211, + "learning_rate": 1.9739925527213343e-05, + "loss": 2.7229, + "step": 5840500 + }, + { + "epoch": 1.8157599006476863, + "grad_norm": 10.647391319274902, + "learning_rate": 1.9737334989205226e-05, + "loss": 2.7517, + "step": 5841000 + }, + { + "epoch": 1.8159153329281732, + "grad_norm": 9.870789527893066, + "learning_rate": 1.9734744451197114e-05, + "loss": 2.7049, + "step": 5841500 + }, + { + "epoch": 1.81607076520866, + "grad_norm": 9.967689514160156, + "learning_rate": 1.9732153913189e-05, + "loss": 2.7163, + "step": 5842000 + }, + { + "epoch": 1.816226197489147, + "grad_norm": 12.449612617492676, + "learning_rate": 1.9729563375180884e-05, + "loss": 2.672, + "step": 5842500 + }, + { + "epoch": 1.8163816297696338, + "grad_norm": 10.700045585632324, + "learning_rate": 1.972697283717277e-05, + "loss": 2.749, + "step": 5843000 + }, + { + "epoch": 1.8165370620501207, + "grad_norm": 15.986172676086426, + "learning_rate": 1.9724382299164655e-05, + "loss": 2.7615, + "step": 5843500 + }, + { + "epoch": 1.8166924943306075, + "grad_norm": 7.7555766105651855, + "learning_rate": 1.9721791761156543e-05, + "loss": 2.7361, + "step": 5844000 + }, + { + "epoch": 1.8168479266110944, + "grad_norm": 8.937047004699707, + "learning_rate": 1.9719201223148426e-05, + "loss": 2.7099, + "step": 5844500 + }, + { + "epoch": 1.8170033588915813, + "grad_norm": 11.132699012756348, + "learning_rate": 1.9716610685140313e-05, + "loss": 2.7029, + "step": 5845000 + }, + { + "epoch": 1.8171587911720684, + "grad_norm": 10.166584968566895, + "learning_rate": 1.9714020147132197e-05, + "loss": 2.7386, + "step": 5845500 + }, + { + "epoch": 1.8173142234525552, + "grad_norm": 14.072752952575684, + "learning_rate": 1.971142960912408e-05, + "loss": 2.7227, + "step": 5846000 + }, + { + "epoch": 1.817469655733042, + "grad_norm": 10.219608306884766, + "learning_rate": 1.9708839071115968e-05, + "loss": 2.7149, + "step": 5846500 + }, + { + "epoch": 1.817625088013529, + "grad_norm": 10.67098617553711, + "learning_rate": 1.9706248533107855e-05, + "loss": 2.7262, + "step": 5847000 + }, + { + "epoch": 1.8177805202940158, + "grad_norm": 24.3519287109375, + "learning_rate": 1.970365799509974e-05, + "loss": 2.7469, + "step": 5847500 + }, + { + "epoch": 1.8179359525745027, + "grad_norm": 10.770756721496582, + "learning_rate": 1.9701067457091623e-05, + "loss": 2.7118, + "step": 5848000 + }, + { + "epoch": 1.8180913848549896, + "grad_norm": 9.496858596801758, + "learning_rate": 1.969847691908351e-05, + "loss": 2.7682, + "step": 5848500 + }, + { + "epoch": 1.8182468171354764, + "grad_norm": 10.074346542358398, + "learning_rate": 1.9695886381075397e-05, + "loss": 2.7152, + "step": 5849000 + }, + { + "epoch": 1.8184022494159633, + "grad_norm": 10.487322807312012, + "learning_rate": 1.969329584306728e-05, + "loss": 2.7333, + "step": 5849500 + }, + { + "epoch": 1.8185576816964502, + "grad_norm": 13.639436721801758, + "learning_rate": 1.9690705305059165e-05, + "loss": 2.7007, + "step": 5850000 + }, + { + "epoch": 1.818713113976937, + "grad_norm": 9.717787742614746, + "learning_rate": 1.9688114767051052e-05, + "loss": 2.7299, + "step": 5850500 + }, + { + "epoch": 1.818868546257424, + "grad_norm": 6.436766147613525, + "learning_rate": 1.9685524229042936e-05, + "loss": 2.6934, + "step": 5851000 + }, + { + "epoch": 1.8190239785379108, + "grad_norm": 7.9500861167907715, + "learning_rate": 1.9682933691034823e-05, + "loss": 2.7149, + "step": 5851500 + }, + { + "epoch": 1.8191794108183976, + "grad_norm": 14.141151428222656, + "learning_rate": 1.968034315302671e-05, + "loss": 2.7217, + "step": 5852000 + }, + { + "epoch": 1.8193348430988845, + "grad_norm": 7.8002495765686035, + "learning_rate": 1.9677752615018594e-05, + "loss": 2.7577, + "step": 5852500 + }, + { + "epoch": 1.8194902753793714, + "grad_norm": 8.621526718139648, + "learning_rate": 1.9675162077010477e-05, + "loss": 2.6843, + "step": 5853000 + }, + { + "epoch": 1.8196457076598582, + "grad_norm": 9.22177505493164, + "learning_rate": 1.9672571539002365e-05, + "loss": 2.7057, + "step": 5853500 + }, + { + "epoch": 1.819801139940345, + "grad_norm": 14.181818008422852, + "learning_rate": 1.9669981000994252e-05, + "loss": 2.7079, + "step": 5854000 + }, + { + "epoch": 1.819956572220832, + "grad_norm": 7.233830451965332, + "learning_rate": 1.9667390462986136e-05, + "loss": 2.7086, + "step": 5854500 + }, + { + "epoch": 1.8201120045013188, + "grad_norm": 13.433783531188965, + "learning_rate": 1.966479992497802e-05, + "loss": 2.7567, + "step": 5855000 + }, + { + "epoch": 1.8202674367818057, + "grad_norm": 10.553604125976562, + "learning_rate": 1.9662209386969906e-05, + "loss": 2.795, + "step": 5855500 + }, + { + "epoch": 1.8204228690622926, + "grad_norm": 7.674769878387451, + "learning_rate": 1.965961884896179e-05, + "loss": 2.734, + "step": 5856000 + }, + { + "epoch": 1.8205783013427794, + "grad_norm": 13.471614837646484, + "learning_rate": 1.9657028310953677e-05, + "loss": 2.7278, + "step": 5856500 + }, + { + "epoch": 1.8207337336232663, + "grad_norm": 11.486950874328613, + "learning_rate": 1.965443777294556e-05, + "loss": 2.732, + "step": 5857000 + }, + { + "epoch": 1.8208891659037532, + "grad_norm": 7.702648639678955, + "learning_rate": 1.9651847234937448e-05, + "loss": 2.7746, + "step": 5857500 + }, + { + "epoch": 1.82104459818424, + "grad_norm": 11.466458320617676, + "learning_rate": 1.9649256696929332e-05, + "loss": 2.7108, + "step": 5858000 + }, + { + "epoch": 1.821200030464727, + "grad_norm": 7.9202494621276855, + "learning_rate": 1.964666615892122e-05, + "loss": 2.7801, + "step": 5858500 + }, + { + "epoch": 1.8213554627452138, + "grad_norm": 8.328402519226074, + "learning_rate": 1.9644075620913103e-05, + "loss": 2.7484, + "step": 5859000 + }, + { + "epoch": 1.8215108950257006, + "grad_norm": 9.883598327636719, + "learning_rate": 1.964148508290499e-05, + "loss": 2.7016, + "step": 5859500 + }, + { + "epoch": 1.8216663273061875, + "grad_norm": 8.169254302978516, + "learning_rate": 1.9638894544896874e-05, + "loss": 2.7396, + "step": 5860000 + }, + { + "epoch": 1.8218217595866744, + "grad_norm": 13.96530818939209, + "learning_rate": 1.9636304006888758e-05, + "loss": 2.756, + "step": 5860500 + }, + { + "epoch": 1.8219771918671612, + "grad_norm": 11.498729705810547, + "learning_rate": 1.9633713468880645e-05, + "loss": 2.7, + "step": 5861000 + }, + { + "epoch": 1.822132624147648, + "grad_norm": 13.058511734008789, + "learning_rate": 1.9631122930872532e-05, + "loss": 2.7297, + "step": 5861500 + }, + { + "epoch": 1.822288056428135, + "grad_norm": 8.87977409362793, + "learning_rate": 1.9628532392864416e-05, + "loss": 2.7256, + "step": 5862000 + }, + { + "epoch": 1.8224434887086218, + "grad_norm": 8.673301696777344, + "learning_rate": 1.96259418548563e-05, + "loss": 2.7211, + "step": 5862500 + }, + { + "epoch": 1.8225989209891087, + "grad_norm": 8.785576820373535, + "learning_rate": 1.9623351316848187e-05, + "loss": 2.7182, + "step": 5863000 + }, + { + "epoch": 1.8227543532695958, + "grad_norm": 12.663071632385254, + "learning_rate": 1.9620760778840074e-05, + "loss": 2.7432, + "step": 5863500 + }, + { + "epoch": 1.8229097855500827, + "grad_norm": 10.6317777633667, + "learning_rate": 1.9618170240831958e-05, + "loss": 2.7452, + "step": 5864000 + }, + { + "epoch": 1.8230652178305695, + "grad_norm": 14.363005638122559, + "learning_rate": 1.9615579702823845e-05, + "loss": 2.7106, + "step": 5864500 + }, + { + "epoch": 1.8232206501110564, + "grad_norm": 10.623077392578125, + "learning_rate": 1.961298916481573e-05, + "loss": 2.7304, + "step": 5865000 + }, + { + "epoch": 1.8233760823915433, + "grad_norm": 7.909615516662598, + "learning_rate": 1.9610398626807612e-05, + "loss": 2.7109, + "step": 5865500 + }, + { + "epoch": 1.8235315146720301, + "grad_norm": 8.068971633911133, + "learning_rate": 1.96078080887995e-05, + "loss": 2.7282, + "step": 5866000 + }, + { + "epoch": 1.823686946952517, + "grad_norm": 10.034185409545898, + "learning_rate": 1.9605217550791387e-05, + "loss": 2.7385, + "step": 5866500 + }, + { + "epoch": 1.8238423792330039, + "grad_norm": 9.971844673156738, + "learning_rate": 1.960262701278327e-05, + "loss": 2.7676, + "step": 5867000 + }, + { + "epoch": 1.8239978115134907, + "grad_norm": 9.033068656921387, + "learning_rate": 1.9600036474775154e-05, + "loss": 2.7208, + "step": 5867500 + }, + { + "epoch": 1.8241532437939776, + "grad_norm": 12.547245025634766, + "learning_rate": 1.9597445936767038e-05, + "loss": 2.7349, + "step": 5868000 + }, + { + "epoch": 1.8243086760744645, + "grad_norm": 9.851085662841797, + "learning_rate": 1.959485539875893e-05, + "loss": 2.6834, + "step": 5868500 + }, + { + "epoch": 1.8244641083549513, + "grad_norm": 9.24326229095459, + "learning_rate": 1.9592264860750812e-05, + "loss": 2.6961, + "step": 5869000 + }, + { + "epoch": 1.8246195406354384, + "grad_norm": 9.352660179138184, + "learning_rate": 1.9589674322742696e-05, + "loss": 2.6572, + "step": 5869500 + }, + { + "epoch": 1.8247749729159253, + "grad_norm": 18.915674209594727, + "learning_rate": 1.9587083784734583e-05, + "loss": 2.7188, + "step": 5870000 + }, + { + "epoch": 1.8249304051964121, + "grad_norm": 8.068860054016113, + "learning_rate": 1.9584493246726467e-05, + "loss": 2.6974, + "step": 5870500 + }, + { + "epoch": 1.825085837476899, + "grad_norm": 10.830252647399902, + "learning_rate": 1.9581902708718354e-05, + "loss": 2.6761, + "step": 5871000 + }, + { + "epoch": 1.8252412697573859, + "grad_norm": 15.004719734191895, + "learning_rate": 1.9579312170710238e-05, + "loss": 2.7348, + "step": 5871500 + }, + { + "epoch": 1.8253967020378727, + "grad_norm": 25.78816032409668, + "learning_rate": 1.9576721632702125e-05, + "loss": 2.7379, + "step": 5872000 + }, + { + "epoch": 1.8255521343183596, + "grad_norm": 12.325338363647461, + "learning_rate": 1.957413109469401e-05, + "loss": 2.7176, + "step": 5872500 + }, + { + "epoch": 1.8257075665988465, + "grad_norm": 8.60894775390625, + "learning_rate": 1.9571540556685892e-05, + "loss": 2.7467, + "step": 5873000 + }, + { + "epoch": 1.8258629988793333, + "grad_norm": 9.142746925354004, + "learning_rate": 1.9568950018677783e-05, + "loss": 2.6935, + "step": 5873500 + }, + { + "epoch": 1.8260184311598202, + "grad_norm": 11.645672798156738, + "learning_rate": 1.9566359480669667e-05, + "loss": 2.7524, + "step": 5874000 + }, + { + "epoch": 1.826173863440307, + "grad_norm": 10.066400527954102, + "learning_rate": 1.956376894266155e-05, + "loss": 2.7271, + "step": 5874500 + }, + { + "epoch": 1.826329295720794, + "grad_norm": 8.946792602539062, + "learning_rate": 1.9561178404653434e-05, + "loss": 2.7265, + "step": 5875000 + }, + { + "epoch": 1.8264847280012808, + "grad_norm": 10.056578636169434, + "learning_rate": 1.955858786664532e-05, + "loss": 2.7225, + "step": 5875500 + }, + { + "epoch": 1.8266401602817677, + "grad_norm": 16.862716674804688, + "learning_rate": 1.955599732863721e-05, + "loss": 2.7327, + "step": 5876000 + }, + { + "epoch": 1.8267955925622545, + "grad_norm": 9.436203956604004, + "learning_rate": 1.9553406790629092e-05, + "loss": 2.7277, + "step": 5876500 + }, + { + "epoch": 1.8269510248427414, + "grad_norm": 8.885478019714355, + "learning_rate": 1.9550816252620976e-05, + "loss": 2.7608, + "step": 5877000 + }, + { + "epoch": 1.8271064571232283, + "grad_norm": 9.701618194580078, + "learning_rate": 1.9548225714612863e-05, + "loss": 2.7253, + "step": 5877500 + }, + { + "epoch": 1.8272618894037151, + "grad_norm": 9.14401626586914, + "learning_rate": 1.9545635176604747e-05, + "loss": 2.7086, + "step": 5878000 + }, + { + "epoch": 1.827417321684202, + "grad_norm": 13.770277976989746, + "learning_rate": 1.9543044638596634e-05, + "loss": 2.7031, + "step": 5878500 + }, + { + "epoch": 1.8275727539646889, + "grad_norm": 16.979045867919922, + "learning_rate": 1.954045410058852e-05, + "loss": 2.7259, + "step": 5879000 + }, + { + "epoch": 1.8277281862451757, + "grad_norm": 12.652554512023926, + "learning_rate": 1.9537863562580405e-05, + "loss": 2.7012, + "step": 5879500 + }, + { + "epoch": 1.8278836185256626, + "grad_norm": 10.080662727355957, + "learning_rate": 1.953527302457229e-05, + "loss": 2.728, + "step": 5880000 + }, + { + "epoch": 1.8280390508061495, + "grad_norm": 14.190921783447266, + "learning_rate": 1.9532682486564176e-05, + "loss": 2.6937, + "step": 5880500 + }, + { + "epoch": 1.8281944830866363, + "grad_norm": 8.908350944519043, + "learning_rate": 1.9530091948556063e-05, + "loss": 2.7502, + "step": 5881000 + }, + { + "epoch": 1.8283499153671232, + "grad_norm": 10.617761611938477, + "learning_rate": 1.9527501410547947e-05, + "loss": 2.7239, + "step": 5881500 + }, + { + "epoch": 1.82850534764761, + "grad_norm": 19.790508270263672, + "learning_rate": 1.952491087253983e-05, + "loss": 2.7473, + "step": 5882000 + }, + { + "epoch": 1.828660779928097, + "grad_norm": 10.0953369140625, + "learning_rate": 1.9522320334531718e-05, + "loss": 2.7124, + "step": 5882500 + }, + { + "epoch": 1.8288162122085838, + "grad_norm": 12.104462623596191, + "learning_rate": 1.95197297965236e-05, + "loss": 2.7398, + "step": 5883000 + }, + { + "epoch": 1.8289716444890707, + "grad_norm": 9.189743041992188, + "learning_rate": 1.951713925851549e-05, + "loss": 2.7822, + "step": 5883500 + }, + { + "epoch": 1.8291270767695575, + "grad_norm": 15.80130672454834, + "learning_rate": 1.9514548720507373e-05, + "loss": 2.6892, + "step": 5884000 + }, + { + "epoch": 1.8292825090500444, + "grad_norm": 10.20699691772461, + "learning_rate": 1.951195818249926e-05, + "loss": 2.7085, + "step": 5884500 + }, + { + "epoch": 1.8294379413305313, + "grad_norm": 28.176616668701172, + "learning_rate": 1.9509367644491143e-05, + "loss": 2.7328, + "step": 5885000 + }, + { + "epoch": 1.8295933736110181, + "grad_norm": 9.923495292663574, + "learning_rate": 1.950677710648303e-05, + "loss": 2.7091, + "step": 5885500 + }, + { + "epoch": 1.829748805891505, + "grad_norm": 9.746542930603027, + "learning_rate": 1.9504186568474914e-05, + "loss": 2.7166, + "step": 5886000 + }, + { + "epoch": 1.8299042381719919, + "grad_norm": 16.256473541259766, + "learning_rate": 1.95015960304668e-05, + "loss": 2.7055, + "step": 5886500 + }, + { + "epoch": 1.8300596704524787, + "grad_norm": 10.41937255859375, + "learning_rate": 1.9499005492458685e-05, + "loss": 2.7307, + "step": 5887000 + }, + { + "epoch": 1.8302151027329658, + "grad_norm": 10.279006958007812, + "learning_rate": 1.949641495445057e-05, + "loss": 2.7465, + "step": 5887500 + }, + { + "epoch": 1.8303705350134527, + "grad_norm": 43.319236755371094, + "learning_rate": 1.9493824416442456e-05, + "loss": 2.7311, + "step": 5888000 + }, + { + "epoch": 1.8305259672939396, + "grad_norm": 9.683568954467773, + "learning_rate": 1.9491233878434343e-05, + "loss": 2.7459, + "step": 5888500 + }, + { + "epoch": 1.8306813995744264, + "grad_norm": 7.934906482696533, + "learning_rate": 1.9488643340426227e-05, + "loss": 2.7181, + "step": 5889000 + }, + { + "epoch": 1.8308368318549133, + "grad_norm": 9.999454498291016, + "learning_rate": 1.948605280241811e-05, + "loss": 2.7587, + "step": 5889500 + }, + { + "epoch": 1.8309922641354002, + "grad_norm": 9.68471908569336, + "learning_rate": 1.9483462264409998e-05, + "loss": 2.7287, + "step": 5890000 + }, + { + "epoch": 1.831147696415887, + "grad_norm": 9.355550765991211, + "learning_rate": 1.9480871726401885e-05, + "loss": 2.6956, + "step": 5890500 + }, + { + "epoch": 1.831303128696374, + "grad_norm": 8.581772804260254, + "learning_rate": 1.947828118839377e-05, + "loss": 2.7085, + "step": 5891000 + }, + { + "epoch": 1.8314585609768608, + "grad_norm": 11.115200996398926, + "learning_rate": 1.9475690650385656e-05, + "loss": 2.7096, + "step": 5891500 + }, + { + "epoch": 1.8316139932573476, + "grad_norm": 13.014634132385254, + "learning_rate": 1.947310011237754e-05, + "loss": 2.7693, + "step": 5892000 + }, + { + "epoch": 1.8317694255378345, + "grad_norm": 35.913143157958984, + "learning_rate": 1.9470509574369424e-05, + "loss": 2.7555, + "step": 5892500 + }, + { + "epoch": 1.8319248578183214, + "grad_norm": 38.12331008911133, + "learning_rate": 1.946791903636131e-05, + "loss": 2.7284, + "step": 5893000 + }, + { + "epoch": 1.8320802900988085, + "grad_norm": 14.792180061340332, + "learning_rate": 1.9465328498353198e-05, + "loss": 2.6957, + "step": 5893500 + }, + { + "epoch": 1.8322357223792953, + "grad_norm": 8.212963104248047, + "learning_rate": 1.9462737960345082e-05, + "loss": 2.7047, + "step": 5894000 + }, + { + "epoch": 1.8323911546597822, + "grad_norm": 11.80132007598877, + "learning_rate": 1.9460147422336965e-05, + "loss": 2.7345, + "step": 5894500 + }, + { + "epoch": 1.832546586940269, + "grad_norm": 11.05171012878418, + "learning_rate": 1.945755688432885e-05, + "loss": 2.7176, + "step": 5895000 + }, + { + "epoch": 1.832702019220756, + "grad_norm": 8.943984031677246, + "learning_rate": 1.945496634632074e-05, + "loss": 2.7236, + "step": 5895500 + }, + { + "epoch": 1.8328574515012428, + "grad_norm": 8.694584846496582, + "learning_rate": 1.9452375808312624e-05, + "loss": 2.7387, + "step": 5896000 + }, + { + "epoch": 1.8330128837817297, + "grad_norm": 21.13042640686035, + "learning_rate": 1.9449785270304507e-05, + "loss": 2.7976, + "step": 5896500 + }, + { + "epoch": 1.8331683160622165, + "grad_norm": 10.435752868652344, + "learning_rate": 1.9447194732296394e-05, + "loss": 2.6941, + "step": 5897000 + }, + { + "epoch": 1.8333237483427034, + "grad_norm": 15.268593788146973, + "learning_rate": 1.9444604194288278e-05, + "loss": 2.7249, + "step": 5897500 + }, + { + "epoch": 1.8334791806231903, + "grad_norm": 8.14041519165039, + "learning_rate": 1.9442013656280165e-05, + "loss": 2.768, + "step": 5898000 + }, + { + "epoch": 1.8336346129036771, + "grad_norm": 9.859172821044922, + "learning_rate": 1.943942311827205e-05, + "loss": 2.7386, + "step": 5898500 + }, + { + "epoch": 1.833790045184164, + "grad_norm": 13.861964225769043, + "learning_rate": 1.9436832580263936e-05, + "loss": 2.6944, + "step": 5899000 + }, + { + "epoch": 1.8339454774646509, + "grad_norm": 8.971973419189453, + "learning_rate": 1.943424204225582e-05, + "loss": 2.7052, + "step": 5899500 + }, + { + "epoch": 1.8341009097451377, + "grad_norm": 10.793350219726562, + "learning_rate": 1.9431651504247704e-05, + "loss": 2.7133, + "step": 5900000 + }, + { + "epoch": 1.8342563420256246, + "grad_norm": 16.779129028320312, + "learning_rate": 1.9429060966239594e-05, + "loss": 2.7191, + "step": 5900500 + }, + { + "epoch": 1.8344117743061115, + "grad_norm": 9.559844017028809, + "learning_rate": 1.9426470428231478e-05, + "loss": 2.7358, + "step": 5901000 + }, + { + "epoch": 1.8345672065865983, + "grad_norm": 8.095938682556152, + "learning_rate": 1.9423879890223362e-05, + "loss": 2.7423, + "step": 5901500 + }, + { + "epoch": 1.8347226388670852, + "grad_norm": 10.012239456176758, + "learning_rate": 1.9421289352215246e-05, + "loss": 2.7181, + "step": 5902000 + }, + { + "epoch": 1.834878071147572, + "grad_norm": 37.64577102661133, + "learning_rate": 1.9418698814207133e-05, + "loss": 2.7729, + "step": 5902500 + }, + { + "epoch": 1.835033503428059, + "grad_norm": 14.734095573425293, + "learning_rate": 1.941610827619902e-05, + "loss": 2.7337, + "step": 5903000 + }, + { + "epoch": 1.8351889357085458, + "grad_norm": 9.351375579833984, + "learning_rate": 1.9413517738190904e-05, + "loss": 2.7646, + "step": 5903500 + }, + { + "epoch": 1.8353443679890327, + "grad_norm": 9.02385139465332, + "learning_rate": 1.9410927200182788e-05, + "loss": 2.747, + "step": 5904000 + }, + { + "epoch": 1.8354998002695195, + "grad_norm": 9.420524597167969, + "learning_rate": 1.9408336662174675e-05, + "loss": 2.7371, + "step": 5904500 + }, + { + "epoch": 1.8356552325500064, + "grad_norm": 14.963634490966797, + "learning_rate": 1.940574612416656e-05, + "loss": 2.6964, + "step": 5905000 + }, + { + "epoch": 1.8358106648304933, + "grad_norm": 9.232150077819824, + "learning_rate": 1.9403155586158446e-05, + "loss": 2.6999, + "step": 5905500 + }, + { + "epoch": 1.8359660971109801, + "grad_norm": 9.82095718383789, + "learning_rate": 1.9400565048150333e-05, + "loss": 2.6938, + "step": 5906000 + }, + { + "epoch": 1.836121529391467, + "grad_norm": 9.525404930114746, + "learning_rate": 1.9397974510142217e-05, + "loss": 2.6818, + "step": 5906500 + }, + { + "epoch": 1.8362769616719539, + "grad_norm": 9.706226348876953, + "learning_rate": 1.93953839721341e-05, + "loss": 2.7601, + "step": 5907000 + }, + { + "epoch": 1.8364323939524407, + "grad_norm": 10.267844200134277, + "learning_rate": 1.9392793434125987e-05, + "loss": 2.695, + "step": 5907500 + }, + { + "epoch": 1.8365878262329276, + "grad_norm": 8.982782363891602, + "learning_rate": 1.9390202896117875e-05, + "loss": 2.7454, + "step": 5908000 + }, + { + "epoch": 1.8367432585134145, + "grad_norm": 8.291261672973633, + "learning_rate": 1.938761235810976e-05, + "loss": 2.7409, + "step": 5908500 + }, + { + "epoch": 1.8368986907939013, + "grad_norm": 25.580942153930664, + "learning_rate": 1.9385021820101642e-05, + "loss": 2.7133, + "step": 5909000 + }, + { + "epoch": 1.8370541230743882, + "grad_norm": 26.460512161254883, + "learning_rate": 1.938243128209353e-05, + "loss": 2.7239, + "step": 5909500 + }, + { + "epoch": 1.837209555354875, + "grad_norm": 18.543617248535156, + "learning_rate": 1.9379840744085413e-05, + "loss": 2.6945, + "step": 5910000 + }, + { + "epoch": 1.837364987635362, + "grad_norm": 10.592011451721191, + "learning_rate": 1.93772502060773e-05, + "loss": 2.7798, + "step": 5910500 + }, + { + "epoch": 1.8375204199158488, + "grad_norm": 8.070802688598633, + "learning_rate": 1.9374659668069184e-05, + "loss": 2.6949, + "step": 5911000 + }, + { + "epoch": 1.837675852196336, + "grad_norm": 8.432278633117676, + "learning_rate": 1.937206913006107e-05, + "loss": 2.726, + "step": 5911500 + }, + { + "epoch": 1.8378312844768228, + "grad_norm": 12.222935676574707, + "learning_rate": 1.9369478592052955e-05, + "loss": 2.7027, + "step": 5912000 + }, + { + "epoch": 1.8379867167573096, + "grad_norm": 13.139991760253906, + "learning_rate": 1.9366888054044842e-05, + "loss": 2.7156, + "step": 5912500 + }, + { + "epoch": 1.8381421490377965, + "grad_norm": 7.626698017120361, + "learning_rate": 1.9364297516036726e-05, + "loss": 2.7386, + "step": 5913000 + }, + { + "epoch": 1.8382975813182834, + "grad_norm": 10.76624870300293, + "learning_rate": 1.9361706978028613e-05, + "loss": 2.7045, + "step": 5913500 + }, + { + "epoch": 1.8384530135987702, + "grad_norm": 7.661322116851807, + "learning_rate": 1.9359116440020497e-05, + "loss": 2.7702, + "step": 5914000 + }, + { + "epoch": 1.838608445879257, + "grad_norm": 29.006492614746094, + "learning_rate": 1.935652590201238e-05, + "loss": 2.6757, + "step": 5914500 + }, + { + "epoch": 1.838763878159744, + "grad_norm": 12.747693061828613, + "learning_rate": 1.9353935364004268e-05, + "loss": 2.7486, + "step": 5915000 + }, + { + "epoch": 1.8389193104402308, + "grad_norm": 15.468889236450195, + "learning_rate": 1.9351344825996155e-05, + "loss": 2.7644, + "step": 5915500 + }, + { + "epoch": 1.8390747427207177, + "grad_norm": 10.224059104919434, + "learning_rate": 1.934875428798804e-05, + "loss": 2.6643, + "step": 5916000 + }, + { + "epoch": 1.8392301750012046, + "grad_norm": 12.8605375289917, + "learning_rate": 1.9346163749979922e-05, + "loss": 2.7456, + "step": 5916500 + }, + { + "epoch": 1.8393856072816914, + "grad_norm": 12.451722145080566, + "learning_rate": 1.934357321197181e-05, + "loss": 2.7284, + "step": 5917000 + }, + { + "epoch": 1.8395410395621785, + "grad_norm": 8.850943565368652, + "learning_rate": 1.9340982673963697e-05, + "loss": 2.6971, + "step": 5917500 + }, + { + "epoch": 1.8396964718426654, + "grad_norm": 8.44196605682373, + "learning_rate": 1.933839213595558e-05, + "loss": 2.6907, + "step": 5918000 + }, + { + "epoch": 1.8398519041231522, + "grad_norm": 8.349740982055664, + "learning_rate": 1.9335801597947468e-05, + "loss": 2.7175, + "step": 5918500 + }, + { + "epoch": 1.8400073364036391, + "grad_norm": 10.238503456115723, + "learning_rate": 1.933321105993935e-05, + "loss": 2.6938, + "step": 5919000 + }, + { + "epoch": 1.840162768684126, + "grad_norm": 13.871720314025879, + "learning_rate": 1.9330620521931235e-05, + "loss": 2.7044, + "step": 5919500 + }, + { + "epoch": 1.8403182009646128, + "grad_norm": 10.23159122467041, + "learning_rate": 1.9328029983923122e-05, + "loss": 2.6987, + "step": 5920000 + }, + { + "epoch": 1.8404736332450997, + "grad_norm": 10.171869277954102, + "learning_rate": 1.932543944591501e-05, + "loss": 2.6821, + "step": 5920500 + }, + { + "epoch": 1.8406290655255866, + "grad_norm": 8.394861221313477, + "learning_rate": 1.9322848907906893e-05, + "loss": 2.7144, + "step": 5921000 + }, + { + "epoch": 1.8407844978060734, + "grad_norm": 8.634516716003418, + "learning_rate": 1.9320258369898777e-05, + "loss": 2.7475, + "step": 5921500 + }, + { + "epoch": 1.8409399300865603, + "grad_norm": 10.926794052124023, + "learning_rate": 1.931766783189066e-05, + "loss": 2.7328, + "step": 5922000 + }, + { + "epoch": 1.8410953623670472, + "grad_norm": 20.17596435546875, + "learning_rate": 1.931507729388255e-05, + "loss": 2.7441, + "step": 5922500 + }, + { + "epoch": 1.841250794647534, + "grad_norm": 7.038793087005615, + "learning_rate": 1.9312486755874435e-05, + "loss": 2.7319, + "step": 5923000 + }, + { + "epoch": 1.841406226928021, + "grad_norm": 9.614179611206055, + "learning_rate": 1.930989621786632e-05, + "loss": 2.6824, + "step": 5923500 + }, + { + "epoch": 1.8415616592085078, + "grad_norm": 28.310585021972656, + "learning_rate": 1.9307305679858206e-05, + "loss": 2.7323, + "step": 5924000 + }, + { + "epoch": 1.8417170914889947, + "grad_norm": 10.641923904418945, + "learning_rate": 1.930471514185009e-05, + "loss": 2.6986, + "step": 5924500 + }, + { + "epoch": 1.8418725237694815, + "grad_norm": 8.269367218017578, + "learning_rate": 1.9302124603841977e-05, + "loss": 2.723, + "step": 5925000 + }, + { + "epoch": 1.8420279560499684, + "grad_norm": 7.9528398513793945, + "learning_rate": 1.929953406583386e-05, + "loss": 2.7236, + "step": 5925500 + }, + { + "epoch": 1.8421833883304553, + "grad_norm": 10.368545532226562, + "learning_rate": 1.9296943527825748e-05, + "loss": 2.7448, + "step": 5926000 + }, + { + "epoch": 1.8423388206109421, + "grad_norm": 10.721858978271484, + "learning_rate": 1.929435298981763e-05, + "loss": 2.73, + "step": 5926500 + }, + { + "epoch": 1.842494252891429, + "grad_norm": 19.46860694885254, + "learning_rate": 1.9291762451809515e-05, + "loss": 2.6935, + "step": 5927000 + }, + { + "epoch": 1.8426496851719159, + "grad_norm": 13.3060302734375, + "learning_rate": 1.9289171913801406e-05, + "loss": 2.7041, + "step": 5927500 + }, + { + "epoch": 1.8428051174524027, + "grad_norm": 9.625868797302246, + "learning_rate": 1.928658137579329e-05, + "loss": 2.7376, + "step": 5928000 + }, + { + "epoch": 1.8429605497328896, + "grad_norm": 10.825176239013672, + "learning_rate": 1.9283990837785173e-05, + "loss": 2.705, + "step": 5928500 + }, + { + "epoch": 1.8431159820133765, + "grad_norm": 7.888832092285156, + "learning_rate": 1.9281400299777057e-05, + "loss": 2.7238, + "step": 5929000 + }, + { + "epoch": 1.8432714142938633, + "grad_norm": 12.826652526855469, + "learning_rate": 1.9278809761768944e-05, + "loss": 2.6882, + "step": 5929500 + }, + { + "epoch": 1.8434268465743502, + "grad_norm": 8.385125160217285, + "learning_rate": 1.927621922376083e-05, + "loss": 2.7365, + "step": 5930000 + }, + { + "epoch": 1.843582278854837, + "grad_norm": 15.129448890686035, + "learning_rate": 1.9273628685752715e-05, + "loss": 2.7064, + "step": 5930500 + }, + { + "epoch": 1.843737711135324, + "grad_norm": 7.725428581237793, + "learning_rate": 1.92710381477446e-05, + "loss": 2.729, + "step": 5931000 + }, + { + "epoch": 1.8438931434158108, + "grad_norm": 9.030694961547852, + "learning_rate": 1.9268447609736486e-05, + "loss": 2.7401, + "step": 5931500 + }, + { + "epoch": 1.8440485756962977, + "grad_norm": 10.139049530029297, + "learning_rate": 1.926585707172837e-05, + "loss": 2.6756, + "step": 5932000 + }, + { + "epoch": 1.8442040079767845, + "grad_norm": 10.379364967346191, + "learning_rate": 1.9263266533720257e-05, + "loss": 2.6797, + "step": 5932500 + }, + { + "epoch": 1.8443594402572714, + "grad_norm": 15.418977737426758, + "learning_rate": 1.9260675995712144e-05, + "loss": 2.7338, + "step": 5933000 + }, + { + "epoch": 1.8445148725377583, + "grad_norm": 9.554678916931152, + "learning_rate": 1.9258085457704028e-05, + "loss": 2.6871, + "step": 5933500 + }, + { + "epoch": 1.8446703048182451, + "grad_norm": 7.587292194366455, + "learning_rate": 1.925549491969591e-05, + "loss": 2.7286, + "step": 5934000 + }, + { + "epoch": 1.844825737098732, + "grad_norm": 25.274795532226562, + "learning_rate": 1.92529043816878e-05, + "loss": 2.7487, + "step": 5934500 + }, + { + "epoch": 1.8449811693792189, + "grad_norm": 8.177465438842773, + "learning_rate": 1.9250313843679686e-05, + "loss": 2.7088, + "step": 5935000 + }, + { + "epoch": 1.845136601659706, + "grad_norm": 11.5833101272583, + "learning_rate": 1.924772330567157e-05, + "loss": 2.698, + "step": 5935500 + }, + { + "epoch": 1.8452920339401928, + "grad_norm": 11.329265594482422, + "learning_rate": 1.9245132767663454e-05, + "loss": 2.7057, + "step": 5936000 + }, + { + "epoch": 1.8454474662206797, + "grad_norm": 11.61716079711914, + "learning_rate": 1.924254222965534e-05, + "loss": 2.7304, + "step": 5936500 + }, + { + "epoch": 1.8456028985011665, + "grad_norm": 9.298653602600098, + "learning_rate": 1.9239951691647224e-05, + "loss": 2.7488, + "step": 5937000 + }, + { + "epoch": 1.8457583307816534, + "grad_norm": 9.808527946472168, + "learning_rate": 1.923736115363911e-05, + "loss": 2.713, + "step": 5937500 + }, + { + "epoch": 1.8459137630621403, + "grad_norm": 11.813005447387695, + "learning_rate": 1.9234770615630995e-05, + "loss": 2.7666, + "step": 5938000 + }, + { + "epoch": 1.8460691953426271, + "grad_norm": 10.232032775878906, + "learning_rate": 1.9232180077622883e-05, + "loss": 2.6877, + "step": 5938500 + }, + { + "epoch": 1.846224627623114, + "grad_norm": 27.437660217285156, + "learning_rate": 1.9229589539614766e-05, + "loss": 2.7559, + "step": 5939000 + }, + { + "epoch": 1.8463800599036009, + "grad_norm": 9.399230003356934, + "learning_rate": 1.9226999001606653e-05, + "loss": 2.6854, + "step": 5939500 + }, + { + "epoch": 1.8465354921840877, + "grad_norm": 10.569805145263672, + "learning_rate": 1.9224408463598537e-05, + "loss": 2.749, + "step": 5940000 + }, + { + "epoch": 1.8466909244645746, + "grad_norm": 8.226483345031738, + "learning_rate": 1.9221817925590424e-05, + "loss": 2.735, + "step": 5940500 + }, + { + "epoch": 1.8468463567450615, + "grad_norm": 8.651368141174316, + "learning_rate": 1.9219227387582308e-05, + "loss": 2.6976, + "step": 5941000 + }, + { + "epoch": 1.8470017890255486, + "grad_norm": 7.922641277313232, + "learning_rate": 1.9216636849574192e-05, + "loss": 2.6565, + "step": 5941500 + }, + { + "epoch": 1.8471572213060354, + "grad_norm": 10.462353706359863, + "learning_rate": 1.921404631156608e-05, + "loss": 2.683, + "step": 5942000 + }, + { + "epoch": 1.8473126535865223, + "grad_norm": 9.672706604003906, + "learning_rate": 1.9211455773557966e-05, + "loss": 2.7511, + "step": 5942500 + }, + { + "epoch": 1.8474680858670092, + "grad_norm": 15.276187896728516, + "learning_rate": 1.920886523554985e-05, + "loss": 2.6965, + "step": 5943000 + }, + { + "epoch": 1.847623518147496, + "grad_norm": 8.29670238494873, + "learning_rate": 1.9206274697541734e-05, + "loss": 2.7091, + "step": 5943500 + }, + { + "epoch": 1.847778950427983, + "grad_norm": 25.461326599121094, + "learning_rate": 1.920368415953362e-05, + "loss": 2.7121, + "step": 5944000 + }, + { + "epoch": 1.8479343827084698, + "grad_norm": 8.935993194580078, + "learning_rate": 1.9201093621525508e-05, + "loss": 2.6947, + "step": 5944500 + }, + { + "epoch": 1.8480898149889566, + "grad_norm": 13.411640167236328, + "learning_rate": 1.9198503083517392e-05, + "loss": 2.7519, + "step": 5945000 + }, + { + "epoch": 1.8482452472694435, + "grad_norm": 9.234545707702637, + "learning_rate": 1.919591254550928e-05, + "loss": 2.6927, + "step": 5945500 + }, + { + "epoch": 1.8484006795499304, + "grad_norm": 9.544713020324707, + "learning_rate": 1.9193322007501163e-05, + "loss": 2.7239, + "step": 5946000 + }, + { + "epoch": 1.8485561118304172, + "grad_norm": 7.762690544128418, + "learning_rate": 1.9190731469493046e-05, + "loss": 2.6966, + "step": 5946500 + }, + { + "epoch": 1.848711544110904, + "grad_norm": 7.6037211418151855, + "learning_rate": 1.9188140931484934e-05, + "loss": 2.7314, + "step": 5947000 + }, + { + "epoch": 1.848866976391391, + "grad_norm": 11.257919311523438, + "learning_rate": 1.918555039347682e-05, + "loss": 2.6925, + "step": 5947500 + }, + { + "epoch": 1.8490224086718778, + "grad_norm": 9.605570793151855, + "learning_rate": 1.9182959855468705e-05, + "loss": 2.7062, + "step": 5948000 + }, + { + "epoch": 1.8491778409523647, + "grad_norm": 10.116650581359863, + "learning_rate": 1.918036931746059e-05, + "loss": 2.7345, + "step": 5948500 + }, + { + "epoch": 1.8493332732328516, + "grad_norm": 16.527259826660156, + "learning_rate": 1.9177778779452475e-05, + "loss": 2.7397, + "step": 5949000 + }, + { + "epoch": 1.8494887055133384, + "grad_norm": 6.497499942779541, + "learning_rate": 1.9175188241444363e-05, + "loss": 2.797, + "step": 5949500 + }, + { + "epoch": 1.8496441377938253, + "grad_norm": 9.072338104248047, + "learning_rate": 1.9172597703436246e-05, + "loss": 2.7022, + "step": 5950000 + }, + { + "epoch": 1.8497995700743122, + "grad_norm": 19.402467727661133, + "learning_rate": 1.917000716542813e-05, + "loss": 2.6948, + "step": 5950500 + }, + { + "epoch": 1.849955002354799, + "grad_norm": 10.65253734588623, + "learning_rate": 1.9167416627420017e-05, + "loss": 2.7924, + "step": 5951000 + }, + { + "epoch": 1.850110434635286, + "grad_norm": 10.416947364807129, + "learning_rate": 1.91648260894119e-05, + "loss": 2.7405, + "step": 5951500 + }, + { + "epoch": 1.8502658669157728, + "grad_norm": 8.454465866088867, + "learning_rate": 1.9162235551403788e-05, + "loss": 2.7158, + "step": 5952000 + }, + { + "epoch": 1.8504212991962596, + "grad_norm": 8.311147689819336, + "learning_rate": 1.9159645013395672e-05, + "loss": 2.7229, + "step": 5952500 + }, + { + "epoch": 1.8505767314767465, + "grad_norm": 9.567471504211426, + "learning_rate": 1.915705447538756e-05, + "loss": 2.7267, + "step": 5953000 + }, + { + "epoch": 1.8507321637572334, + "grad_norm": 11.896867752075195, + "learning_rate": 1.9154463937379443e-05, + "loss": 2.7154, + "step": 5953500 + }, + { + "epoch": 1.8508875960377202, + "grad_norm": 8.767813682556152, + "learning_rate": 1.915187339937133e-05, + "loss": 2.721, + "step": 5954000 + }, + { + "epoch": 1.851043028318207, + "grad_norm": 8.914344787597656, + "learning_rate": 1.9149282861363217e-05, + "loss": 2.7798, + "step": 5954500 + }, + { + "epoch": 1.851198460598694, + "grad_norm": 11.24909782409668, + "learning_rate": 1.91466923233551e-05, + "loss": 2.7794, + "step": 5955000 + }, + { + "epoch": 1.8513538928791808, + "grad_norm": 9.535077095031738, + "learning_rate": 1.9144101785346985e-05, + "loss": 2.7028, + "step": 5955500 + }, + { + "epoch": 1.8515093251596677, + "grad_norm": 9.217939376831055, + "learning_rate": 1.914151124733887e-05, + "loss": 2.6761, + "step": 5956000 + }, + { + "epoch": 1.8516647574401546, + "grad_norm": 8.917879104614258, + "learning_rate": 1.9138920709330756e-05, + "loss": 2.7471, + "step": 5956500 + }, + { + "epoch": 1.8518201897206414, + "grad_norm": 11.467935562133789, + "learning_rate": 1.9136330171322643e-05, + "loss": 2.7902, + "step": 5957000 + }, + { + "epoch": 1.8519756220011283, + "grad_norm": 9.185526847839355, + "learning_rate": 1.9133739633314527e-05, + "loss": 2.7533, + "step": 5957500 + }, + { + "epoch": 1.8521310542816152, + "grad_norm": 8.987043380737305, + "learning_rate": 1.913114909530641e-05, + "loss": 2.7127, + "step": 5958000 + }, + { + "epoch": 1.852286486562102, + "grad_norm": 12.569496154785156, + "learning_rate": 1.9128558557298298e-05, + "loss": 2.7279, + "step": 5958500 + }, + { + "epoch": 1.852441918842589, + "grad_norm": 6.556455135345459, + "learning_rate": 1.9125968019290185e-05, + "loss": 2.6933, + "step": 5959000 + }, + { + "epoch": 1.852597351123076, + "grad_norm": 9.647444725036621, + "learning_rate": 1.912337748128207e-05, + "loss": 2.703, + "step": 5959500 + }, + { + "epoch": 1.8527527834035629, + "grad_norm": 9.490828514099121, + "learning_rate": 1.9120786943273956e-05, + "loss": 2.7481, + "step": 5960000 + }, + { + "epoch": 1.8529082156840497, + "grad_norm": 8.098514556884766, + "learning_rate": 1.911819640526584e-05, + "loss": 2.7355, + "step": 5960500 + }, + { + "epoch": 1.8530636479645366, + "grad_norm": 14.062032699584961, + "learning_rate": 1.9115605867257723e-05, + "loss": 2.718, + "step": 5961000 + }, + { + "epoch": 1.8532190802450235, + "grad_norm": 10.354594230651855, + "learning_rate": 1.911301532924961e-05, + "loss": 2.6747, + "step": 5961500 + }, + { + "epoch": 1.8533745125255103, + "grad_norm": 7.11743688583374, + "learning_rate": 1.9110424791241497e-05, + "loss": 2.7038, + "step": 5962000 + }, + { + "epoch": 1.8535299448059972, + "grad_norm": 9.173131942749023, + "learning_rate": 1.910783425323338e-05, + "loss": 2.7122, + "step": 5962500 + }, + { + "epoch": 1.853685377086484, + "grad_norm": 11.458627700805664, + "learning_rate": 1.9105243715225265e-05, + "loss": 2.6981, + "step": 5963000 + }, + { + "epoch": 1.853840809366971, + "grad_norm": 6.7030816078186035, + "learning_rate": 1.9102653177217152e-05, + "loss": 2.7447, + "step": 5963500 + }, + { + "epoch": 1.8539962416474578, + "grad_norm": 9.290349960327148, + "learning_rate": 1.910006263920904e-05, + "loss": 2.7349, + "step": 5964000 + }, + { + "epoch": 1.8541516739279447, + "grad_norm": 14.515789031982422, + "learning_rate": 1.9097472101200923e-05, + "loss": 2.7455, + "step": 5964500 + }, + { + "epoch": 1.8543071062084315, + "grad_norm": 6.690277099609375, + "learning_rate": 1.9094881563192807e-05, + "loss": 2.709, + "step": 5965000 + }, + { + "epoch": 1.8544625384889186, + "grad_norm": 8.801326751708984, + "learning_rate": 1.9092291025184694e-05, + "loss": 2.7177, + "step": 5965500 + }, + { + "epoch": 1.8546179707694055, + "grad_norm": 9.932028770446777, + "learning_rate": 1.9089700487176578e-05, + "loss": 2.7538, + "step": 5966000 + }, + { + "epoch": 1.8547734030498924, + "grad_norm": 10.456613540649414, + "learning_rate": 1.9087109949168465e-05, + "loss": 2.7206, + "step": 5966500 + }, + { + "epoch": 1.8549288353303792, + "grad_norm": 9.478036880493164, + "learning_rate": 1.908451941116035e-05, + "loss": 2.7155, + "step": 5967000 + }, + { + "epoch": 1.855084267610866, + "grad_norm": 10.10958194732666, + "learning_rate": 1.9081928873152236e-05, + "loss": 2.6696, + "step": 5967500 + }, + { + "epoch": 1.855239699891353, + "grad_norm": 10.354378700256348, + "learning_rate": 1.907933833514412e-05, + "loss": 2.6815, + "step": 5968000 + }, + { + "epoch": 1.8553951321718398, + "grad_norm": 9.127056121826172, + "learning_rate": 1.9076747797136003e-05, + "loss": 2.7086, + "step": 5968500 + }, + { + "epoch": 1.8555505644523267, + "grad_norm": 10.542411804199219, + "learning_rate": 1.9074157259127894e-05, + "loss": 2.6656, + "step": 5969000 + }, + { + "epoch": 1.8557059967328136, + "grad_norm": 7.8825225830078125, + "learning_rate": 1.9071566721119778e-05, + "loss": 2.7084, + "step": 5969500 + }, + { + "epoch": 1.8558614290133004, + "grad_norm": 11.029184341430664, + "learning_rate": 1.906897618311166e-05, + "loss": 2.6905, + "step": 5970000 + }, + { + "epoch": 1.8560168612937873, + "grad_norm": 23.422889709472656, + "learning_rate": 1.9066385645103545e-05, + "loss": 2.689, + "step": 5970500 + }, + { + "epoch": 1.8561722935742742, + "grad_norm": 29.744596481323242, + "learning_rate": 1.9063795107095432e-05, + "loss": 2.7557, + "step": 5971000 + }, + { + "epoch": 1.856327725854761, + "grad_norm": 8.912837028503418, + "learning_rate": 1.906120456908732e-05, + "loss": 2.7393, + "step": 5971500 + }, + { + "epoch": 1.856483158135248, + "grad_norm": 10.231221199035645, + "learning_rate": 1.9058614031079203e-05, + "loss": 2.7304, + "step": 5972000 + }, + { + "epoch": 1.8566385904157348, + "grad_norm": 12.800875663757324, + "learning_rate": 1.905602349307109e-05, + "loss": 2.6952, + "step": 5972500 + }, + { + "epoch": 1.8567940226962216, + "grad_norm": 9.41718578338623, + "learning_rate": 1.9053432955062974e-05, + "loss": 2.6633, + "step": 5973000 + }, + { + "epoch": 1.8569494549767085, + "grad_norm": 9.087386131286621, + "learning_rate": 1.9050842417054858e-05, + "loss": 2.704, + "step": 5973500 + }, + { + "epoch": 1.8571048872571954, + "grad_norm": 11.86451530456543, + "learning_rate": 1.9048251879046745e-05, + "loss": 2.7262, + "step": 5974000 + }, + { + "epoch": 1.8572603195376822, + "grad_norm": 8.707900047302246, + "learning_rate": 1.9045661341038632e-05, + "loss": 2.723, + "step": 5974500 + }, + { + "epoch": 1.857415751818169, + "grad_norm": 10.047426223754883, + "learning_rate": 1.9043070803030516e-05, + "loss": 2.7224, + "step": 5975000 + }, + { + "epoch": 1.857571184098656, + "grad_norm": 9.927544593811035, + "learning_rate": 1.90404802650224e-05, + "loss": 2.7136, + "step": 5975500 + }, + { + "epoch": 1.8577266163791428, + "grad_norm": 10.664999008178711, + "learning_rate": 1.9037889727014287e-05, + "loss": 2.7005, + "step": 5976000 + }, + { + "epoch": 1.8578820486596297, + "grad_norm": 14.082316398620605, + "learning_rate": 1.9035299189006174e-05, + "loss": 2.7018, + "step": 5976500 + }, + { + "epoch": 1.8580374809401166, + "grad_norm": 9.540850639343262, + "learning_rate": 1.9032708650998058e-05, + "loss": 2.7316, + "step": 5977000 + }, + { + "epoch": 1.8581929132206034, + "grad_norm": 9.959388732910156, + "learning_rate": 1.903011811298994e-05, + "loss": 2.7176, + "step": 5977500 + }, + { + "epoch": 1.8583483455010903, + "grad_norm": 13.852478981018066, + "learning_rate": 1.902752757498183e-05, + "loss": 2.7302, + "step": 5978000 + }, + { + "epoch": 1.8585037777815772, + "grad_norm": 9.433220863342285, + "learning_rate": 1.9024937036973712e-05, + "loss": 2.7215, + "step": 5978500 + }, + { + "epoch": 1.858659210062064, + "grad_norm": 8.754587173461914, + "learning_rate": 1.90223464989656e-05, + "loss": 2.6912, + "step": 5979000 + }, + { + "epoch": 1.858814642342551, + "grad_norm": 8.825477600097656, + "learning_rate": 1.9019755960957483e-05, + "loss": 2.7057, + "step": 5979500 + }, + { + "epoch": 1.8589700746230378, + "grad_norm": 8.616562843322754, + "learning_rate": 1.901716542294937e-05, + "loss": 2.7409, + "step": 5980000 + }, + { + "epoch": 1.8591255069035246, + "grad_norm": 20.83783721923828, + "learning_rate": 1.9014574884941254e-05, + "loss": 2.7232, + "step": 5980500 + }, + { + "epoch": 1.8592809391840115, + "grad_norm": 10.940361976623535, + "learning_rate": 1.901198434693314e-05, + "loss": 2.695, + "step": 5981000 + }, + { + "epoch": 1.8594363714644984, + "grad_norm": 8.359601020812988, + "learning_rate": 1.900939380892503e-05, + "loss": 2.74, + "step": 5981500 + }, + { + "epoch": 1.8595918037449852, + "grad_norm": 10.998103141784668, + "learning_rate": 1.9006803270916912e-05, + "loss": 2.6979, + "step": 5982000 + }, + { + "epoch": 1.859747236025472, + "grad_norm": 10.594416618347168, + "learning_rate": 1.9004212732908796e-05, + "loss": 2.7165, + "step": 5982500 + }, + { + "epoch": 1.859902668305959, + "grad_norm": 14.521500587463379, + "learning_rate": 1.900162219490068e-05, + "loss": 2.7183, + "step": 5983000 + }, + { + "epoch": 1.8600581005864458, + "grad_norm": 8.207060813903809, + "learning_rate": 1.8999031656892567e-05, + "loss": 2.7476, + "step": 5983500 + }, + { + "epoch": 1.860213532866933, + "grad_norm": 10.125555992126465, + "learning_rate": 1.8996441118884454e-05, + "loss": 2.6769, + "step": 5984000 + }, + { + "epoch": 1.8603689651474198, + "grad_norm": 9.838946342468262, + "learning_rate": 1.8993850580876338e-05, + "loss": 2.7106, + "step": 5984500 + }, + { + "epoch": 1.8605243974279067, + "grad_norm": 18.96656036376953, + "learning_rate": 1.8991260042868225e-05, + "loss": 2.7085, + "step": 5985000 + }, + { + "epoch": 1.8606798297083935, + "grad_norm": 8.443427085876465, + "learning_rate": 1.898866950486011e-05, + "loss": 2.7116, + "step": 5985500 + }, + { + "epoch": 1.8608352619888804, + "grad_norm": 10.044112205505371, + "learning_rate": 1.8986078966851996e-05, + "loss": 2.7474, + "step": 5986000 + }, + { + "epoch": 1.8609906942693673, + "grad_norm": 9.11677360534668, + "learning_rate": 1.898348842884388e-05, + "loss": 2.702, + "step": 5986500 + }, + { + "epoch": 1.8611461265498541, + "grad_norm": 9.114253997802734, + "learning_rate": 1.8980897890835767e-05, + "loss": 2.677, + "step": 5987000 + }, + { + "epoch": 1.861301558830341, + "grad_norm": 9.97287654876709, + "learning_rate": 1.897830735282765e-05, + "loss": 2.7716, + "step": 5987500 + }, + { + "epoch": 1.8614569911108279, + "grad_norm": 11.318787574768066, + "learning_rate": 1.8975716814819535e-05, + "loss": 2.7116, + "step": 5988000 + }, + { + "epoch": 1.8616124233913147, + "grad_norm": 11.656963348388672, + "learning_rate": 1.897312627681142e-05, + "loss": 2.6872, + "step": 5988500 + }, + { + "epoch": 1.8617678556718016, + "grad_norm": 19.69732666015625, + "learning_rate": 1.897053573880331e-05, + "loss": 2.7115, + "step": 5989000 + }, + { + "epoch": 1.8619232879522885, + "grad_norm": 9.810626029968262, + "learning_rate": 1.8967945200795193e-05, + "loss": 2.7393, + "step": 5989500 + }, + { + "epoch": 1.8620787202327755, + "grad_norm": 27.42306900024414, + "learning_rate": 1.8965354662787076e-05, + "loss": 2.738, + "step": 5990000 + }, + { + "epoch": 1.8622341525132624, + "grad_norm": 8.645700454711914, + "learning_rate": 1.8962764124778964e-05, + "loss": 2.7146, + "step": 5990500 + }, + { + "epoch": 1.8623895847937493, + "grad_norm": 9.055633544921875, + "learning_rate": 1.896017358677085e-05, + "loss": 2.6542, + "step": 5991000 + }, + { + "epoch": 1.8625450170742361, + "grad_norm": 6.763203144073486, + "learning_rate": 1.8957583048762734e-05, + "loss": 2.7482, + "step": 5991500 + }, + { + "epoch": 1.862700449354723, + "grad_norm": 12.311690330505371, + "learning_rate": 1.8954992510754618e-05, + "loss": 2.7113, + "step": 5992000 + }, + { + "epoch": 1.8628558816352099, + "grad_norm": 10.08047103881836, + "learning_rate": 1.8952401972746505e-05, + "loss": 2.7016, + "step": 5992500 + }, + { + "epoch": 1.8630113139156967, + "grad_norm": 16.26807975769043, + "learning_rate": 1.894981143473839e-05, + "loss": 2.7484, + "step": 5993000 + }, + { + "epoch": 1.8631667461961836, + "grad_norm": 40.4676399230957, + "learning_rate": 1.8947220896730276e-05, + "loss": 2.7574, + "step": 5993500 + }, + { + "epoch": 1.8633221784766705, + "grad_norm": 6.964455604553223, + "learning_rate": 1.8944630358722163e-05, + "loss": 2.6713, + "step": 5994000 + }, + { + "epoch": 1.8634776107571573, + "grad_norm": 7.996394634246826, + "learning_rate": 1.8942039820714047e-05, + "loss": 2.7252, + "step": 5994500 + }, + { + "epoch": 1.8636330430376442, + "grad_norm": 10.141022682189941, + "learning_rate": 1.893944928270593e-05, + "loss": 2.7328, + "step": 5995000 + }, + { + "epoch": 1.863788475318131, + "grad_norm": 9.53536605834961, + "learning_rate": 1.8936858744697815e-05, + "loss": 2.6833, + "step": 5995500 + }, + { + "epoch": 1.863943907598618, + "grad_norm": 10.91337776184082, + "learning_rate": 1.8934268206689705e-05, + "loss": 2.7252, + "step": 5996000 + }, + { + "epoch": 1.8640993398791048, + "grad_norm": 11.067663192749023, + "learning_rate": 1.893167766868159e-05, + "loss": 2.7145, + "step": 5996500 + }, + { + "epoch": 1.8642547721595917, + "grad_norm": 46.336158752441406, + "learning_rate": 1.8929087130673473e-05, + "loss": 2.6702, + "step": 5997000 + }, + { + "epoch": 1.8644102044400785, + "grad_norm": 11.156062126159668, + "learning_rate": 1.8926496592665357e-05, + "loss": 2.7282, + "step": 5997500 + }, + { + "epoch": 1.8645656367205654, + "grad_norm": 9.646936416625977, + "learning_rate": 1.8923906054657244e-05, + "loss": 2.7222, + "step": 5998000 + }, + { + "epoch": 1.8647210690010523, + "grad_norm": 11.159334182739258, + "learning_rate": 1.892131551664913e-05, + "loss": 2.7015, + "step": 5998500 + }, + { + "epoch": 1.8648765012815391, + "grad_norm": 9.816084861755371, + "learning_rate": 1.8918724978641015e-05, + "loss": 2.7119, + "step": 5999000 + }, + { + "epoch": 1.865031933562026, + "grad_norm": 8.944392204284668, + "learning_rate": 1.8916134440632902e-05, + "loss": 2.7214, + "step": 5999500 + }, + { + "epoch": 1.8651873658425129, + "grad_norm": 10.209685325622559, + "learning_rate": 1.8913543902624786e-05, + "loss": 2.7178, + "step": 6000000 + }, + { + "epoch": 1.8653427981229997, + "grad_norm": 9.862903594970703, + "learning_rate": 1.891095336461667e-05, + "loss": 2.7121, + "step": 6000500 + }, + { + "epoch": 1.8654982304034866, + "grad_norm": 15.351444244384766, + "learning_rate": 1.8908362826608556e-05, + "loss": 2.7047, + "step": 6001000 + }, + { + "epoch": 1.8656536626839735, + "grad_norm": 10.708128929138184, + "learning_rate": 1.8905772288600444e-05, + "loss": 2.7192, + "step": 6001500 + }, + { + "epoch": 1.8658090949644603, + "grad_norm": 8.787737846374512, + "learning_rate": 1.8903181750592327e-05, + "loss": 2.755, + "step": 6002000 + }, + { + "epoch": 1.8659645272449472, + "grad_norm": 9.488152503967285, + "learning_rate": 1.890059121258421e-05, + "loss": 2.6718, + "step": 6002500 + }, + { + "epoch": 1.866119959525434, + "grad_norm": 10.23861312866211, + "learning_rate": 1.8898000674576098e-05, + "loss": 2.706, + "step": 6003000 + }, + { + "epoch": 1.866275391805921, + "grad_norm": 9.54845142364502, + "learning_rate": 1.8895410136567985e-05, + "loss": 2.7107, + "step": 6003500 + }, + { + "epoch": 1.8664308240864078, + "grad_norm": 12.82939624786377, + "learning_rate": 1.889281959855987e-05, + "loss": 2.7442, + "step": 6004000 + }, + { + "epoch": 1.8665862563668947, + "grad_norm": 11.725180625915527, + "learning_rate": 1.8890229060551753e-05, + "loss": 2.722, + "step": 6004500 + }, + { + "epoch": 1.8667416886473815, + "grad_norm": 8.298314094543457, + "learning_rate": 1.888763852254364e-05, + "loss": 2.7187, + "step": 6005000 + }, + { + "epoch": 1.8668971209278684, + "grad_norm": 19.696535110473633, + "learning_rate": 1.8885047984535524e-05, + "loss": 2.6997, + "step": 6005500 + }, + { + "epoch": 1.8670525532083553, + "grad_norm": 9.980474472045898, + "learning_rate": 1.888245744652741e-05, + "loss": 2.7241, + "step": 6006000 + }, + { + "epoch": 1.8672079854888421, + "grad_norm": 8.6727294921875, + "learning_rate": 1.8879866908519295e-05, + "loss": 2.7219, + "step": 6006500 + }, + { + "epoch": 1.867363417769329, + "grad_norm": 13.35693645477295, + "learning_rate": 1.8877276370511182e-05, + "loss": 2.7434, + "step": 6007000 + }, + { + "epoch": 1.8675188500498159, + "grad_norm": 11.453672409057617, + "learning_rate": 1.8874685832503066e-05, + "loss": 2.7178, + "step": 6007500 + }, + { + "epoch": 1.867674282330303, + "grad_norm": 13.806238174438477, + "learning_rate": 1.8872095294494953e-05, + "loss": 2.7325, + "step": 6008000 + }, + { + "epoch": 1.8678297146107898, + "grad_norm": 9.647372245788574, + "learning_rate": 1.886950475648684e-05, + "loss": 2.7312, + "step": 6008500 + }, + { + "epoch": 1.8679851468912767, + "grad_norm": 9.968746185302734, + "learning_rate": 1.8866914218478724e-05, + "loss": 2.6886, + "step": 6009000 + }, + { + "epoch": 1.8681405791717636, + "grad_norm": 11.407791137695312, + "learning_rate": 1.8864323680470608e-05, + "loss": 2.7509, + "step": 6009500 + }, + { + "epoch": 1.8682960114522504, + "grad_norm": 9.343252182006836, + "learning_rate": 1.886173314246249e-05, + "loss": 2.7446, + "step": 6010000 + }, + { + "epoch": 1.8684514437327373, + "grad_norm": 9.00916576385498, + "learning_rate": 1.885914260445438e-05, + "loss": 2.7535, + "step": 6010500 + }, + { + "epoch": 1.8686068760132242, + "grad_norm": 8.590598106384277, + "learning_rate": 1.8856552066446266e-05, + "loss": 2.7409, + "step": 6011000 + }, + { + "epoch": 1.868762308293711, + "grad_norm": 8.755053520202637, + "learning_rate": 1.885396152843815e-05, + "loss": 2.6883, + "step": 6011500 + }, + { + "epoch": 1.868917740574198, + "grad_norm": 22.134408950805664, + "learning_rate": 1.8851370990430037e-05, + "loss": 2.7563, + "step": 6012000 + }, + { + "epoch": 1.8690731728546848, + "grad_norm": 8.16431713104248, + "learning_rate": 1.884878045242192e-05, + "loss": 2.7121, + "step": 6012500 + }, + { + "epoch": 1.8692286051351716, + "grad_norm": 10.434492111206055, + "learning_rate": 1.8846189914413807e-05, + "loss": 2.7375, + "step": 6013000 + }, + { + "epoch": 1.8693840374156585, + "grad_norm": 9.145423889160156, + "learning_rate": 1.884359937640569e-05, + "loss": 2.7609, + "step": 6013500 + }, + { + "epoch": 1.8695394696961456, + "grad_norm": 11.980125427246094, + "learning_rate": 1.884100883839758e-05, + "loss": 2.6817, + "step": 6014000 + }, + { + "epoch": 1.8696949019766325, + "grad_norm": 35.4970588684082, + "learning_rate": 1.8838418300389462e-05, + "loss": 2.6739, + "step": 6014500 + }, + { + "epoch": 1.8698503342571193, + "grad_norm": 5.8360700607299805, + "learning_rate": 1.8835827762381346e-05, + "loss": 2.716, + "step": 6015000 + }, + { + "epoch": 1.8700057665376062, + "grad_norm": 9.324596405029297, + "learning_rate": 1.8833237224373233e-05, + "loss": 2.7125, + "step": 6015500 + }, + { + "epoch": 1.870161198818093, + "grad_norm": 9.270745277404785, + "learning_rate": 1.883064668636512e-05, + "loss": 2.6994, + "step": 6016000 + }, + { + "epoch": 1.87031663109858, + "grad_norm": 8.671927452087402, + "learning_rate": 1.8828056148357004e-05, + "loss": 2.7442, + "step": 6016500 + }, + { + "epoch": 1.8704720633790668, + "grad_norm": 9.703879356384277, + "learning_rate": 1.8825465610348888e-05, + "loss": 2.7314, + "step": 6017000 + }, + { + "epoch": 1.8706274956595537, + "grad_norm": 10.165336608886719, + "learning_rate": 1.8822875072340775e-05, + "loss": 2.6803, + "step": 6017500 + }, + { + "epoch": 1.8707829279400405, + "grad_norm": 9.588770866394043, + "learning_rate": 1.8820284534332662e-05, + "loss": 2.7547, + "step": 6018000 + }, + { + "epoch": 1.8709383602205274, + "grad_norm": 10.017268180847168, + "learning_rate": 1.8817693996324546e-05, + "loss": 2.7114, + "step": 6018500 + }, + { + "epoch": 1.8710937925010143, + "grad_norm": 9.89673900604248, + "learning_rate": 1.881510345831643e-05, + "loss": 2.7328, + "step": 6019000 + }, + { + "epoch": 1.8712492247815011, + "grad_norm": 9.233521461486816, + "learning_rate": 1.8812512920308317e-05, + "loss": 2.6848, + "step": 6019500 + }, + { + "epoch": 1.871404657061988, + "grad_norm": 17.387426376342773, + "learning_rate": 1.88099223823002e-05, + "loss": 2.7156, + "step": 6020000 + }, + { + "epoch": 1.8715600893424749, + "grad_norm": 14.965561866760254, + "learning_rate": 1.8807331844292088e-05, + "loss": 2.7103, + "step": 6020500 + }, + { + "epoch": 1.8717155216229617, + "grad_norm": 11.001607894897461, + "learning_rate": 1.8804741306283975e-05, + "loss": 2.7427, + "step": 6021000 + }, + { + "epoch": 1.8718709539034486, + "grad_norm": 12.438023567199707, + "learning_rate": 1.880215076827586e-05, + "loss": 2.7689, + "step": 6021500 + }, + { + "epoch": 1.8720263861839355, + "grad_norm": 11.470542907714844, + "learning_rate": 1.8799560230267742e-05, + "loss": 2.7032, + "step": 6022000 + }, + { + "epoch": 1.8721818184644223, + "grad_norm": 8.920654296875, + "learning_rate": 1.8796969692259626e-05, + "loss": 2.71, + "step": 6022500 + }, + { + "epoch": 1.8723372507449092, + "grad_norm": 9.67846393585205, + "learning_rate": 1.8794379154251517e-05, + "loss": 2.7003, + "step": 6023000 + }, + { + "epoch": 1.872492683025396, + "grad_norm": 10.998493194580078, + "learning_rate": 1.87917886162434e-05, + "loss": 2.706, + "step": 6023500 + }, + { + "epoch": 1.872648115305883, + "grad_norm": 9.83278751373291, + "learning_rate": 1.8789198078235284e-05, + "loss": 2.6887, + "step": 6024000 + }, + { + "epoch": 1.8728035475863698, + "grad_norm": 8.053661346435547, + "learning_rate": 1.8786607540227168e-05, + "loss": 2.7407, + "step": 6024500 + }, + { + "epoch": 1.8729589798668567, + "grad_norm": 10.559992790222168, + "learning_rate": 1.8784017002219055e-05, + "loss": 2.6924, + "step": 6025000 + }, + { + "epoch": 1.8731144121473435, + "grad_norm": 10.653428077697754, + "learning_rate": 1.8781426464210942e-05, + "loss": 2.7263, + "step": 6025500 + }, + { + "epoch": 1.8732698444278304, + "grad_norm": 10.865760803222656, + "learning_rate": 1.8778835926202826e-05, + "loss": 2.7241, + "step": 6026000 + }, + { + "epoch": 1.8734252767083173, + "grad_norm": 7.815793991088867, + "learning_rate": 1.8776245388194713e-05, + "loss": 2.7191, + "step": 6026500 + }, + { + "epoch": 1.8735807089888041, + "grad_norm": 9.53984546661377, + "learning_rate": 1.8773654850186597e-05, + "loss": 2.7544, + "step": 6027000 + }, + { + "epoch": 1.873736141269291, + "grad_norm": 12.637049674987793, + "learning_rate": 1.877106431217848e-05, + "loss": 2.7144, + "step": 6027500 + }, + { + "epoch": 1.8738915735497779, + "grad_norm": 8.477142333984375, + "learning_rate": 1.8768473774170368e-05, + "loss": 2.737, + "step": 6028000 + }, + { + "epoch": 1.8740470058302647, + "grad_norm": 8.033339500427246, + "learning_rate": 1.8765883236162255e-05, + "loss": 2.6925, + "step": 6028500 + }, + { + "epoch": 1.8742024381107516, + "grad_norm": 8.919971466064453, + "learning_rate": 1.876329269815414e-05, + "loss": 2.6863, + "step": 6029000 + }, + { + "epoch": 1.8743578703912385, + "grad_norm": 8.095599174499512, + "learning_rate": 1.8760702160146023e-05, + "loss": 2.7063, + "step": 6029500 + }, + { + "epoch": 1.8745133026717253, + "grad_norm": 8.461544036865234, + "learning_rate": 1.875811162213791e-05, + "loss": 2.715, + "step": 6030000 + }, + { + "epoch": 1.8746687349522122, + "grad_norm": 10.566699028015137, + "learning_rate": 1.8755521084129797e-05, + "loss": 2.6767, + "step": 6030500 + }, + { + "epoch": 1.874824167232699, + "grad_norm": 18.42547035217285, + "learning_rate": 1.875293054612168e-05, + "loss": 2.6895, + "step": 6031000 + }, + { + "epoch": 1.874979599513186, + "grad_norm": 9.669187545776367, + "learning_rate": 1.8750340008113564e-05, + "loss": 2.7522, + "step": 6031500 + }, + { + "epoch": 1.875135031793673, + "grad_norm": 9.401656150817871, + "learning_rate": 1.874774947010545e-05, + "loss": 2.7038, + "step": 6032000 + }, + { + "epoch": 1.87529046407416, + "grad_norm": 8.976663589477539, + "learning_rate": 1.8745158932097335e-05, + "loss": 2.7111, + "step": 6032500 + }, + { + "epoch": 1.8754458963546468, + "grad_norm": 8.971589088439941, + "learning_rate": 1.8742568394089222e-05, + "loss": 2.7159, + "step": 6033000 + }, + { + "epoch": 1.8756013286351336, + "grad_norm": 21.75731658935547, + "learning_rate": 1.8739977856081106e-05, + "loss": 2.7263, + "step": 6033500 + }, + { + "epoch": 1.8757567609156205, + "grad_norm": 9.351910591125488, + "learning_rate": 1.8737387318072993e-05, + "loss": 2.6821, + "step": 6034000 + }, + { + "epoch": 1.8759121931961074, + "grad_norm": 10.503827095031738, + "learning_rate": 1.8734796780064877e-05, + "loss": 2.7148, + "step": 6034500 + }, + { + "epoch": 1.8760676254765942, + "grad_norm": 10.117318153381348, + "learning_rate": 1.8732206242056764e-05, + "loss": 2.7375, + "step": 6035000 + }, + { + "epoch": 1.876223057757081, + "grad_norm": 8.380512237548828, + "learning_rate": 1.872961570404865e-05, + "loss": 2.7362, + "step": 6035500 + }, + { + "epoch": 1.876378490037568, + "grad_norm": 11.157049179077148, + "learning_rate": 1.8727025166040535e-05, + "loss": 2.7252, + "step": 6036000 + }, + { + "epoch": 1.8765339223180548, + "grad_norm": 11.472481727600098, + "learning_rate": 1.872443462803242e-05, + "loss": 2.7483, + "step": 6036500 + }, + { + "epoch": 1.8766893545985417, + "grad_norm": 17.593271255493164, + "learning_rate": 1.8721844090024303e-05, + "loss": 2.7368, + "step": 6037000 + }, + { + "epoch": 1.8768447868790286, + "grad_norm": 12.192160606384277, + "learning_rate": 1.871925355201619e-05, + "loss": 2.6991, + "step": 6037500 + }, + { + "epoch": 1.8770002191595156, + "grad_norm": 13.788349151611328, + "learning_rate": 1.8716663014008077e-05, + "loss": 2.7407, + "step": 6038000 + }, + { + "epoch": 1.8771556514400025, + "grad_norm": 8.71782398223877, + "learning_rate": 1.871407247599996e-05, + "loss": 2.7036, + "step": 6038500 + }, + { + "epoch": 1.8773110837204894, + "grad_norm": 9.492671012878418, + "learning_rate": 1.8711481937991848e-05, + "loss": 2.7125, + "step": 6039000 + }, + { + "epoch": 1.8774665160009762, + "grad_norm": 7.4249725341796875, + "learning_rate": 1.8708891399983732e-05, + "loss": 2.6886, + "step": 6039500 + }, + { + "epoch": 1.8776219482814631, + "grad_norm": 19.955474853515625, + "learning_rate": 1.870630086197562e-05, + "loss": 2.7155, + "step": 6040000 + }, + { + "epoch": 1.87777738056195, + "grad_norm": 17.69347381591797, + "learning_rate": 1.8703710323967503e-05, + "loss": 2.6741, + "step": 6040500 + }, + { + "epoch": 1.8779328128424368, + "grad_norm": 10.114870071411133, + "learning_rate": 1.870111978595939e-05, + "loss": 2.6639, + "step": 6041000 + }, + { + "epoch": 1.8780882451229237, + "grad_norm": 39.340492248535156, + "learning_rate": 1.8698529247951274e-05, + "loss": 2.7014, + "step": 6041500 + }, + { + "epoch": 1.8782436774034106, + "grad_norm": 11.32135009765625, + "learning_rate": 1.8695938709943157e-05, + "loss": 2.7757, + "step": 6042000 + }, + { + "epoch": 1.8783991096838974, + "grad_norm": 15.372342109680176, + "learning_rate": 1.8693348171935045e-05, + "loss": 2.6636, + "step": 6042500 + }, + { + "epoch": 1.8785545419643843, + "grad_norm": 7.3391900062561035, + "learning_rate": 1.869075763392693e-05, + "loss": 2.7525, + "step": 6043000 + }, + { + "epoch": 1.8787099742448712, + "grad_norm": 8.188575744628906, + "learning_rate": 1.8688167095918815e-05, + "loss": 2.7097, + "step": 6043500 + }, + { + "epoch": 1.878865406525358, + "grad_norm": 14.935492515563965, + "learning_rate": 1.86855765579107e-05, + "loss": 2.7491, + "step": 6044000 + }, + { + "epoch": 1.879020838805845, + "grad_norm": 7.5531721115112305, + "learning_rate": 1.8682986019902586e-05, + "loss": 2.699, + "step": 6044500 + }, + { + "epoch": 1.8791762710863318, + "grad_norm": 9.33746337890625, + "learning_rate": 1.8680395481894473e-05, + "loss": 2.7747, + "step": 6045000 + }, + { + "epoch": 1.8793317033668187, + "grad_norm": 11.011147499084473, + "learning_rate": 1.8677804943886357e-05, + "loss": 2.7114, + "step": 6045500 + }, + { + "epoch": 1.8794871356473055, + "grad_norm": 19.851526260375977, + "learning_rate": 1.867521440587824e-05, + "loss": 2.7334, + "step": 6046000 + }, + { + "epoch": 1.8796425679277924, + "grad_norm": 9.73703670501709, + "learning_rate": 1.8672623867870128e-05, + "loss": 2.7201, + "step": 6046500 + }, + { + "epoch": 1.8797980002082793, + "grad_norm": 6.5954461097717285, + "learning_rate": 1.8670033329862012e-05, + "loss": 2.7813, + "step": 6047000 + }, + { + "epoch": 1.8799534324887661, + "grad_norm": 9.550644874572754, + "learning_rate": 1.86674427918539e-05, + "loss": 2.6985, + "step": 6047500 + }, + { + "epoch": 1.880108864769253, + "grad_norm": 15.168713569641113, + "learning_rate": 1.8664852253845786e-05, + "loss": 2.7014, + "step": 6048000 + }, + { + "epoch": 1.8802642970497399, + "grad_norm": 9.837081909179688, + "learning_rate": 1.866226171583767e-05, + "loss": 2.6911, + "step": 6048500 + }, + { + "epoch": 1.8804197293302267, + "grad_norm": 9.190340995788574, + "learning_rate": 1.8659671177829554e-05, + "loss": 2.6706, + "step": 6049000 + }, + { + "epoch": 1.8805751616107136, + "grad_norm": 11.856093406677246, + "learning_rate": 1.865708063982144e-05, + "loss": 2.7, + "step": 6049500 + }, + { + "epoch": 1.8807305938912005, + "grad_norm": 8.90710163116455, + "learning_rate": 1.8654490101813328e-05, + "loss": 2.7177, + "step": 6050000 + }, + { + "epoch": 1.8808860261716873, + "grad_norm": 13.49462604522705, + "learning_rate": 1.8651899563805212e-05, + "loss": 2.6929, + "step": 6050500 + }, + { + "epoch": 1.8810414584521742, + "grad_norm": 6.926163196563721, + "learning_rate": 1.8649309025797096e-05, + "loss": 2.7127, + "step": 6051000 + }, + { + "epoch": 1.881196890732661, + "grad_norm": 9.459495544433594, + "learning_rate": 1.864671848778898e-05, + "loss": 2.6951, + "step": 6051500 + }, + { + "epoch": 1.881352323013148, + "grad_norm": 14.9026460647583, + "learning_rate": 1.8644127949780867e-05, + "loss": 2.6901, + "step": 6052000 + }, + { + "epoch": 1.8815077552936348, + "grad_norm": 8.664196968078613, + "learning_rate": 1.8641537411772754e-05, + "loss": 2.6703, + "step": 6052500 + }, + { + "epoch": 1.8816631875741217, + "grad_norm": 10.499333381652832, + "learning_rate": 1.8638946873764637e-05, + "loss": 2.7071, + "step": 6053000 + }, + { + "epoch": 1.8818186198546085, + "grad_norm": 10.557357788085938, + "learning_rate": 1.8636356335756525e-05, + "loss": 2.7381, + "step": 6053500 + }, + { + "epoch": 1.8819740521350954, + "grad_norm": 16.89124298095703, + "learning_rate": 1.863376579774841e-05, + "loss": 2.7078, + "step": 6054000 + }, + { + "epoch": 1.8821294844155823, + "grad_norm": 9.26371955871582, + "learning_rate": 1.8631175259740296e-05, + "loss": 2.7305, + "step": 6054500 + }, + { + "epoch": 1.8822849166960691, + "grad_norm": 14.915901184082031, + "learning_rate": 1.862858472173218e-05, + "loss": 2.6838, + "step": 6055000 + }, + { + "epoch": 1.882440348976556, + "grad_norm": 11.99267578125, + "learning_rate": 1.8625994183724066e-05, + "loss": 2.7618, + "step": 6055500 + }, + { + "epoch": 1.882595781257043, + "grad_norm": 9.433612823486328, + "learning_rate": 1.862340364571595e-05, + "loss": 2.6619, + "step": 6056000 + }, + { + "epoch": 1.88275121353753, + "grad_norm": 9.218669891357422, + "learning_rate": 1.8620813107707834e-05, + "loss": 2.6771, + "step": 6056500 + }, + { + "epoch": 1.8829066458180168, + "grad_norm": 9.089394569396973, + "learning_rate": 1.861822256969972e-05, + "loss": 2.7736, + "step": 6057000 + }, + { + "epoch": 1.8830620780985037, + "grad_norm": 11.156364440917969, + "learning_rate": 1.8615632031691608e-05, + "loss": 2.7231, + "step": 6057500 + }, + { + "epoch": 1.8832175103789905, + "grad_norm": 9.855628967285156, + "learning_rate": 1.8613041493683492e-05, + "loss": 2.7396, + "step": 6058000 + }, + { + "epoch": 1.8833729426594774, + "grad_norm": 10.361088752746582, + "learning_rate": 1.8610450955675376e-05, + "loss": 2.7034, + "step": 6058500 + }, + { + "epoch": 1.8835283749399643, + "grad_norm": 8.391470909118652, + "learning_rate": 1.8607860417667263e-05, + "loss": 2.7282, + "step": 6059000 + }, + { + "epoch": 1.8836838072204511, + "grad_norm": 10.424560546875, + "learning_rate": 1.860526987965915e-05, + "loss": 2.699, + "step": 6059500 + }, + { + "epoch": 1.883839239500938, + "grad_norm": 10.478599548339844, + "learning_rate": 1.8602679341651034e-05, + "loss": 2.7637, + "step": 6060000 + }, + { + "epoch": 1.8839946717814249, + "grad_norm": 11.02152156829834, + "learning_rate": 1.8600088803642918e-05, + "loss": 2.7227, + "step": 6060500 + }, + { + "epoch": 1.8841501040619117, + "grad_norm": 7.265111446380615, + "learning_rate": 1.8597498265634805e-05, + "loss": 2.7431, + "step": 6061000 + }, + { + "epoch": 1.8843055363423986, + "grad_norm": 9.601414680480957, + "learning_rate": 1.859490772762669e-05, + "loss": 2.709, + "step": 6061500 + }, + { + "epoch": 1.8844609686228857, + "grad_norm": 10.958948135375977, + "learning_rate": 1.8592317189618576e-05, + "loss": 2.7153, + "step": 6062000 + }, + { + "epoch": 1.8846164009033726, + "grad_norm": 9.77261734008789, + "learning_rate": 1.8589726651610463e-05, + "loss": 2.7217, + "step": 6062500 + }, + { + "epoch": 1.8847718331838594, + "grad_norm": 11.488658905029297, + "learning_rate": 1.8587136113602347e-05, + "loss": 2.7572, + "step": 6063000 + }, + { + "epoch": 1.8849272654643463, + "grad_norm": 10.435378074645996, + "learning_rate": 1.858454557559423e-05, + "loss": 2.7148, + "step": 6063500 + }, + { + "epoch": 1.8850826977448332, + "grad_norm": 8.582566261291504, + "learning_rate": 1.8581955037586114e-05, + "loss": 2.7025, + "step": 6064000 + }, + { + "epoch": 1.88523813002532, + "grad_norm": 10.83251667022705, + "learning_rate": 1.8579364499578005e-05, + "loss": 2.7167, + "step": 6064500 + }, + { + "epoch": 1.885393562305807, + "grad_norm": 12.120450019836426, + "learning_rate": 1.857677396156989e-05, + "loss": 2.7117, + "step": 6065000 + }, + { + "epoch": 1.8855489945862938, + "grad_norm": 8.69562816619873, + "learning_rate": 1.8574183423561772e-05, + "loss": 2.7014, + "step": 6065500 + }, + { + "epoch": 1.8857044268667806, + "grad_norm": 9.062617301940918, + "learning_rate": 1.857159288555366e-05, + "loss": 2.7322, + "step": 6066000 + }, + { + "epoch": 1.8858598591472675, + "grad_norm": 9.29155445098877, + "learning_rate": 1.8569002347545543e-05, + "loss": 2.7386, + "step": 6066500 + }, + { + "epoch": 1.8860152914277544, + "grad_norm": 13.73889446258545, + "learning_rate": 1.856641180953743e-05, + "loss": 2.7312, + "step": 6067000 + }, + { + "epoch": 1.8861707237082412, + "grad_norm": 9.133115768432617, + "learning_rate": 1.8563821271529314e-05, + "loss": 2.6853, + "step": 6067500 + }, + { + "epoch": 1.886326155988728, + "grad_norm": 9.437515258789062, + "learning_rate": 1.85612307335212e-05, + "loss": 2.6926, + "step": 6068000 + }, + { + "epoch": 1.886481588269215, + "grad_norm": 8.056296348571777, + "learning_rate": 1.8558640195513085e-05, + "loss": 2.6855, + "step": 6068500 + }, + { + "epoch": 1.8866370205497018, + "grad_norm": 13.29648208618164, + "learning_rate": 1.855604965750497e-05, + "loss": 2.7208, + "step": 6069000 + }, + { + "epoch": 1.8867924528301887, + "grad_norm": 7.373196601867676, + "learning_rate": 1.8553459119496856e-05, + "loss": 2.7138, + "step": 6069500 + }, + { + "epoch": 1.8869478851106756, + "grad_norm": 8.215150833129883, + "learning_rate": 1.8550868581488743e-05, + "loss": 2.7253, + "step": 6070000 + }, + { + "epoch": 1.8871033173911624, + "grad_norm": 9.751851081848145, + "learning_rate": 1.8548278043480627e-05, + "loss": 2.7758, + "step": 6070500 + }, + { + "epoch": 1.8872587496716493, + "grad_norm": 10.355398178100586, + "learning_rate": 1.854568750547251e-05, + "loss": 2.696, + "step": 6071000 + }, + { + "epoch": 1.8874141819521362, + "grad_norm": 8.30056095123291, + "learning_rate": 1.8543096967464398e-05, + "loss": 2.655, + "step": 6071500 + }, + { + "epoch": 1.887569614232623, + "grad_norm": 8.964468002319336, + "learning_rate": 1.8540506429456285e-05, + "loss": 2.7096, + "step": 6072000 + }, + { + "epoch": 1.88772504651311, + "grad_norm": 30.346881866455078, + "learning_rate": 1.853791589144817e-05, + "loss": 2.755, + "step": 6072500 + }, + { + "epoch": 1.8878804787935968, + "grad_norm": 16.825115203857422, + "learning_rate": 1.8535325353440052e-05, + "loss": 2.6952, + "step": 6073000 + }, + { + "epoch": 1.8880359110740836, + "grad_norm": 15.421774864196777, + "learning_rate": 1.853273481543194e-05, + "loss": 2.7026, + "step": 6073500 + }, + { + "epoch": 1.8881913433545705, + "grad_norm": 13.362330436706543, + "learning_rate": 1.8530144277423823e-05, + "loss": 2.7083, + "step": 6074000 + }, + { + "epoch": 1.8883467756350574, + "grad_norm": 13.622397422790527, + "learning_rate": 1.852755373941571e-05, + "loss": 2.729, + "step": 6074500 + }, + { + "epoch": 1.8885022079155442, + "grad_norm": 12.68899154663086, + "learning_rate": 1.8524963201407598e-05, + "loss": 2.7632, + "step": 6075000 + }, + { + "epoch": 1.888657640196031, + "grad_norm": 9.259654998779297, + "learning_rate": 1.852237266339948e-05, + "loss": 2.7171, + "step": 6075500 + }, + { + "epoch": 1.888813072476518, + "grad_norm": 11.168353080749512, + "learning_rate": 1.8519782125391365e-05, + "loss": 2.732, + "step": 6076000 + }, + { + "epoch": 1.8889685047570048, + "grad_norm": 10.406055450439453, + "learning_rate": 1.8517191587383252e-05, + "loss": 2.6859, + "step": 6076500 + }, + { + "epoch": 1.8891239370374917, + "grad_norm": 7.902820587158203, + "learning_rate": 1.851460104937514e-05, + "loss": 2.7217, + "step": 6077000 + }, + { + "epoch": 1.8892793693179786, + "grad_norm": 8.493569374084473, + "learning_rate": 1.8512010511367023e-05, + "loss": 2.6921, + "step": 6077500 + }, + { + "epoch": 1.8894348015984654, + "grad_norm": 8.454635620117188, + "learning_rate": 1.8509419973358907e-05, + "loss": 2.7048, + "step": 6078000 + }, + { + "epoch": 1.8895902338789523, + "grad_norm": 11.098021507263184, + "learning_rate": 1.850682943535079e-05, + "loss": 2.6748, + "step": 6078500 + }, + { + "epoch": 1.8897456661594392, + "grad_norm": 11.542237281799316, + "learning_rate": 1.8504238897342678e-05, + "loss": 2.718, + "step": 6079000 + }, + { + "epoch": 1.889901098439926, + "grad_norm": 9.82749080657959, + "learning_rate": 1.8501648359334565e-05, + "loss": 2.7499, + "step": 6079500 + }, + { + "epoch": 1.8900565307204131, + "grad_norm": 15.421232223510742, + "learning_rate": 1.849905782132645e-05, + "loss": 2.7387, + "step": 6080000 + }, + { + "epoch": 1.8902119630009, + "grad_norm": 8.902820587158203, + "learning_rate": 1.8496467283318336e-05, + "loss": 2.6958, + "step": 6080500 + }, + { + "epoch": 1.8903673952813869, + "grad_norm": 10.064388275146484, + "learning_rate": 1.849387674531022e-05, + "loss": 2.7049, + "step": 6081000 + }, + { + "epoch": 1.8905228275618737, + "grad_norm": 11.886594772338867, + "learning_rate": 1.8491286207302107e-05, + "loss": 2.7179, + "step": 6081500 + }, + { + "epoch": 1.8906782598423606, + "grad_norm": 11.532355308532715, + "learning_rate": 1.848869566929399e-05, + "loss": 2.7153, + "step": 6082000 + }, + { + "epoch": 1.8908336921228475, + "grad_norm": 11.148540496826172, + "learning_rate": 1.8486105131285878e-05, + "loss": 2.7512, + "step": 6082500 + }, + { + "epoch": 1.8909891244033343, + "grad_norm": 9.121857643127441, + "learning_rate": 1.848351459327776e-05, + "loss": 2.7388, + "step": 6083000 + }, + { + "epoch": 1.8911445566838212, + "grad_norm": 10.094294548034668, + "learning_rate": 1.8480924055269645e-05, + "loss": 2.7019, + "step": 6083500 + }, + { + "epoch": 1.891299988964308, + "grad_norm": 9.413198471069336, + "learning_rate": 1.8478333517261533e-05, + "loss": 2.7315, + "step": 6084000 + }, + { + "epoch": 1.891455421244795, + "grad_norm": 9.075212478637695, + "learning_rate": 1.847574297925342e-05, + "loss": 2.7276, + "step": 6084500 + }, + { + "epoch": 1.8916108535252818, + "grad_norm": 8.681662559509277, + "learning_rate": 1.8473152441245303e-05, + "loss": 2.6796, + "step": 6085000 + }, + { + "epoch": 1.8917662858057687, + "grad_norm": 9.684221267700195, + "learning_rate": 1.8470561903237187e-05, + "loss": 2.7127, + "step": 6085500 + }, + { + "epoch": 1.8919217180862558, + "grad_norm": 10.729865074157715, + "learning_rate": 1.8467971365229074e-05, + "loss": 2.7067, + "step": 6086000 + }, + { + "epoch": 1.8920771503667426, + "grad_norm": 11.026744842529297, + "learning_rate": 1.846538082722096e-05, + "loss": 2.6919, + "step": 6086500 + }, + { + "epoch": 1.8922325826472295, + "grad_norm": 10.034345626831055, + "learning_rate": 1.8462790289212845e-05, + "loss": 2.6503, + "step": 6087000 + }, + { + "epoch": 1.8923880149277164, + "grad_norm": 10.026107788085938, + "learning_rate": 1.846019975120473e-05, + "loss": 2.6722, + "step": 6087500 + }, + { + "epoch": 1.8925434472082032, + "grad_norm": 10.22282600402832, + "learning_rate": 1.8457609213196616e-05, + "loss": 2.7189, + "step": 6088000 + }, + { + "epoch": 1.89269887948869, + "grad_norm": 9.809000015258789, + "learning_rate": 1.84550186751885e-05, + "loss": 2.7213, + "step": 6088500 + }, + { + "epoch": 1.892854311769177, + "grad_norm": 10.921399116516113, + "learning_rate": 1.8452428137180387e-05, + "loss": 2.7431, + "step": 6089000 + }, + { + "epoch": 1.8930097440496638, + "grad_norm": 8.633552551269531, + "learning_rate": 1.8449837599172274e-05, + "loss": 2.6303, + "step": 6089500 + }, + { + "epoch": 1.8931651763301507, + "grad_norm": 13.906764030456543, + "learning_rate": 1.8447247061164158e-05, + "loss": 2.7053, + "step": 6090000 + }, + { + "epoch": 1.8933206086106376, + "grad_norm": 8.852465629577637, + "learning_rate": 1.8444656523156042e-05, + "loss": 2.697, + "step": 6090500 + }, + { + "epoch": 1.8934760408911244, + "grad_norm": 8.520367622375488, + "learning_rate": 1.8442065985147926e-05, + "loss": 2.665, + "step": 6091000 + }, + { + "epoch": 1.8936314731716113, + "grad_norm": 7.887214183807373, + "learning_rate": 1.8439475447139816e-05, + "loss": 2.737, + "step": 6091500 + }, + { + "epoch": 1.8937869054520982, + "grad_norm": 10.222151756286621, + "learning_rate": 1.84368849091317e-05, + "loss": 2.6815, + "step": 6092000 + }, + { + "epoch": 1.893942337732585, + "grad_norm": 9.520700454711914, + "learning_rate": 1.8434294371123584e-05, + "loss": 2.7221, + "step": 6092500 + }, + { + "epoch": 1.894097770013072, + "grad_norm": 10.22057056427002, + "learning_rate": 1.843170383311547e-05, + "loss": 2.7187, + "step": 6093000 + }, + { + "epoch": 1.8942532022935588, + "grad_norm": 7.4371113777160645, + "learning_rate": 1.8429113295107355e-05, + "loss": 2.6617, + "step": 6093500 + }, + { + "epoch": 1.8944086345740456, + "grad_norm": 10.447236061096191, + "learning_rate": 1.8426522757099242e-05, + "loss": 2.724, + "step": 6094000 + }, + { + "epoch": 1.8945640668545325, + "grad_norm": 11.327098846435547, + "learning_rate": 1.8423932219091125e-05, + "loss": 2.6825, + "step": 6094500 + }, + { + "epoch": 1.8947194991350194, + "grad_norm": 12.18635368347168, + "learning_rate": 1.8421341681083013e-05, + "loss": 2.6713, + "step": 6095000 + }, + { + "epoch": 1.8948749314155062, + "grad_norm": 9.019115447998047, + "learning_rate": 1.8418751143074896e-05, + "loss": 2.7468, + "step": 6095500 + }, + { + "epoch": 1.895030363695993, + "grad_norm": 9.908740997314453, + "learning_rate": 1.841616060506678e-05, + "loss": 2.6816, + "step": 6096000 + }, + { + "epoch": 1.89518579597648, + "grad_norm": 10.788138389587402, + "learning_rate": 1.8413570067058667e-05, + "loss": 2.7289, + "step": 6096500 + }, + { + "epoch": 1.8953412282569668, + "grad_norm": 8.844636917114258, + "learning_rate": 1.8410979529050554e-05, + "loss": 2.7406, + "step": 6097000 + }, + { + "epoch": 1.8954966605374537, + "grad_norm": 10.194693565368652, + "learning_rate": 1.8408388991042438e-05, + "loss": 2.7442, + "step": 6097500 + }, + { + "epoch": 1.8956520928179406, + "grad_norm": 9.452268600463867, + "learning_rate": 1.8405798453034322e-05, + "loss": 2.7499, + "step": 6098000 + }, + { + "epoch": 1.8958075250984274, + "grad_norm": 9.130873680114746, + "learning_rate": 1.840320791502621e-05, + "loss": 2.7031, + "step": 6098500 + }, + { + "epoch": 1.8959629573789143, + "grad_norm": 9.175013542175293, + "learning_rate": 1.8400617377018096e-05, + "loss": 2.7186, + "step": 6099000 + }, + { + "epoch": 1.8961183896594012, + "grad_norm": 8.739191055297852, + "learning_rate": 1.839802683900998e-05, + "loss": 2.6803, + "step": 6099500 + }, + { + "epoch": 1.896273821939888, + "grad_norm": 9.071808815002441, + "learning_rate": 1.8395436301001864e-05, + "loss": 2.6792, + "step": 6100000 + }, + { + "epoch": 1.896429254220375, + "grad_norm": 8.720965385437012, + "learning_rate": 1.839284576299375e-05, + "loss": 2.782, + "step": 6100500 + }, + { + "epoch": 1.8965846865008618, + "grad_norm": 8.381579399108887, + "learning_rate": 1.8390255224985635e-05, + "loss": 2.7126, + "step": 6101000 + }, + { + "epoch": 1.8967401187813486, + "grad_norm": 8.0128812789917, + "learning_rate": 1.8387664686977522e-05, + "loss": 2.6611, + "step": 6101500 + }, + { + "epoch": 1.8968955510618355, + "grad_norm": 11.198123931884766, + "learning_rate": 1.838507414896941e-05, + "loss": 2.7208, + "step": 6102000 + }, + { + "epoch": 1.8970509833423224, + "grad_norm": 10.618398666381836, + "learning_rate": 1.8382483610961293e-05, + "loss": 2.7053, + "step": 6102500 + }, + { + "epoch": 1.8972064156228092, + "grad_norm": 22.99637222290039, + "learning_rate": 1.8379893072953177e-05, + "loss": 2.7032, + "step": 6103000 + }, + { + "epoch": 1.897361847903296, + "grad_norm": 55.90673065185547, + "learning_rate": 1.8377302534945064e-05, + "loss": 2.689, + "step": 6103500 + }, + { + "epoch": 1.8975172801837832, + "grad_norm": 9.105853080749512, + "learning_rate": 1.837471199693695e-05, + "loss": 2.7334, + "step": 6104000 + }, + { + "epoch": 1.89767271246427, + "grad_norm": 7.960531711578369, + "learning_rate": 1.8372121458928835e-05, + "loss": 2.7343, + "step": 6104500 + }, + { + "epoch": 1.897828144744757, + "grad_norm": 9.11685848236084, + "learning_rate": 1.836953092092072e-05, + "loss": 2.7558, + "step": 6105000 + }, + { + "epoch": 1.8979835770252438, + "grad_norm": 8.778807640075684, + "learning_rate": 1.8366940382912602e-05, + "loss": 2.694, + "step": 6105500 + }, + { + "epoch": 1.8981390093057307, + "grad_norm": 11.667305946350098, + "learning_rate": 1.836434984490449e-05, + "loss": 2.7148, + "step": 6106000 + }, + { + "epoch": 1.8982944415862175, + "grad_norm": 8.481538772583008, + "learning_rate": 1.8361759306896377e-05, + "loss": 2.7092, + "step": 6106500 + }, + { + "epoch": 1.8984498738667044, + "grad_norm": 7.901231288909912, + "learning_rate": 1.835916876888826e-05, + "loss": 2.7108, + "step": 6107000 + }, + { + "epoch": 1.8986053061471913, + "grad_norm": 10.16264820098877, + "learning_rate": 1.8356578230880147e-05, + "loss": 2.7172, + "step": 6107500 + }, + { + "epoch": 1.8987607384276781, + "grad_norm": 10.545754432678223, + "learning_rate": 1.835398769287203e-05, + "loss": 2.7433, + "step": 6108000 + }, + { + "epoch": 1.898916170708165, + "grad_norm": 10.881854057312012, + "learning_rate": 1.835139715486392e-05, + "loss": 2.718, + "step": 6108500 + }, + { + "epoch": 1.8990716029886519, + "grad_norm": 11.235732078552246, + "learning_rate": 1.8348806616855802e-05, + "loss": 2.7373, + "step": 6109000 + }, + { + "epoch": 1.8992270352691387, + "grad_norm": 8.11071491241455, + "learning_rate": 1.834621607884769e-05, + "loss": 2.7337, + "step": 6109500 + }, + { + "epoch": 1.8993824675496256, + "grad_norm": 9.416751861572266, + "learning_rate": 1.8343625540839573e-05, + "loss": 2.7457, + "step": 6110000 + }, + { + "epoch": 1.8995378998301127, + "grad_norm": 33.48689270019531, + "learning_rate": 1.8341035002831457e-05, + "loss": 2.7445, + "step": 6110500 + }, + { + "epoch": 1.8996933321105995, + "grad_norm": 8.771708488464355, + "learning_rate": 1.8338444464823344e-05, + "loss": 2.6863, + "step": 6111000 + }, + { + "epoch": 1.8998487643910864, + "grad_norm": 17.862924575805664, + "learning_rate": 1.833585392681523e-05, + "loss": 2.711, + "step": 6111500 + }, + { + "epoch": 1.9000041966715733, + "grad_norm": 14.906447410583496, + "learning_rate": 1.8333263388807115e-05, + "loss": 2.7575, + "step": 6112000 + }, + { + "epoch": 1.9001596289520601, + "grad_norm": 10.901904106140137, + "learning_rate": 1.8330672850799e-05, + "loss": 2.6956, + "step": 6112500 + }, + { + "epoch": 1.900315061232547, + "grad_norm": 9.512120246887207, + "learning_rate": 1.8328082312790886e-05, + "loss": 2.7199, + "step": 6113000 + }, + { + "epoch": 1.9004704935130339, + "grad_norm": 8.699094772338867, + "learning_rate": 1.8325491774782773e-05, + "loss": 2.7394, + "step": 6113500 + }, + { + "epoch": 1.9006259257935207, + "grad_norm": 12.339881896972656, + "learning_rate": 1.8322901236774657e-05, + "loss": 2.6878, + "step": 6114000 + }, + { + "epoch": 1.9007813580740076, + "grad_norm": 9.332329750061035, + "learning_rate": 1.8320310698766544e-05, + "loss": 2.7165, + "step": 6114500 + }, + { + "epoch": 1.9009367903544945, + "grad_norm": 9.674449920654297, + "learning_rate": 1.8317720160758428e-05, + "loss": 2.6953, + "step": 6115000 + }, + { + "epoch": 1.9010922226349813, + "grad_norm": 41.91887283325195, + "learning_rate": 1.831512962275031e-05, + "loss": 2.6765, + "step": 6115500 + }, + { + "epoch": 1.9012476549154682, + "grad_norm": 9.805656433105469, + "learning_rate": 1.83125390847422e-05, + "loss": 2.703, + "step": 6116000 + }, + { + "epoch": 1.901403087195955, + "grad_norm": 7.579477787017822, + "learning_rate": 1.8309948546734086e-05, + "loss": 2.7176, + "step": 6116500 + }, + { + "epoch": 1.901558519476442, + "grad_norm": 13.664706230163574, + "learning_rate": 1.830735800872597e-05, + "loss": 2.7323, + "step": 6117000 + }, + { + "epoch": 1.9017139517569288, + "grad_norm": 9.18190860748291, + "learning_rate": 1.8304767470717853e-05, + "loss": 2.7117, + "step": 6117500 + }, + { + "epoch": 1.9018693840374157, + "grad_norm": 14.912920951843262, + "learning_rate": 1.8302176932709737e-05, + "loss": 2.7028, + "step": 6118000 + }, + { + "epoch": 1.9020248163179025, + "grad_norm": 11.257734298706055, + "learning_rate": 1.8299586394701628e-05, + "loss": 2.7353, + "step": 6118500 + }, + { + "epoch": 1.9021802485983894, + "grad_norm": 10.13015079498291, + "learning_rate": 1.829699585669351e-05, + "loss": 2.6796, + "step": 6119000 + }, + { + "epoch": 1.9023356808788763, + "grad_norm": 10.239930152893066, + "learning_rate": 1.8294405318685395e-05, + "loss": 2.7375, + "step": 6119500 + }, + { + "epoch": 1.9024911131593631, + "grad_norm": 10.078268051147461, + "learning_rate": 1.8291814780677282e-05, + "loss": 2.7098, + "step": 6120000 + }, + { + "epoch": 1.90264654543985, + "grad_norm": 9.091398239135742, + "learning_rate": 1.8289224242669166e-05, + "loss": 2.6986, + "step": 6120500 + }, + { + "epoch": 1.9028019777203369, + "grad_norm": 10.281903266906738, + "learning_rate": 1.8286633704661053e-05, + "loss": 2.6504, + "step": 6121000 + }, + { + "epoch": 1.9029574100008237, + "grad_norm": 9.511797904968262, + "learning_rate": 1.8284043166652937e-05, + "loss": 2.7006, + "step": 6121500 + }, + { + "epoch": 1.9031128422813106, + "grad_norm": 11.264223098754883, + "learning_rate": 1.8281452628644824e-05, + "loss": 2.7351, + "step": 6122000 + }, + { + "epoch": 1.9032682745617975, + "grad_norm": 7.564523220062256, + "learning_rate": 1.8278862090636708e-05, + "loss": 2.7371, + "step": 6122500 + }, + { + "epoch": 1.9034237068422843, + "grad_norm": 8.220922470092773, + "learning_rate": 1.827627155262859e-05, + "loss": 2.6722, + "step": 6123000 + }, + { + "epoch": 1.9035791391227712, + "grad_norm": 10.321487426757812, + "learning_rate": 1.8273681014620482e-05, + "loss": 2.7138, + "step": 6123500 + }, + { + "epoch": 1.903734571403258, + "grad_norm": 20.345972061157227, + "learning_rate": 1.8271090476612366e-05, + "loss": 2.7119, + "step": 6124000 + }, + { + "epoch": 1.903890003683745, + "grad_norm": 18.820646286010742, + "learning_rate": 1.826849993860425e-05, + "loss": 2.6705, + "step": 6124500 + }, + { + "epoch": 1.9040454359642318, + "grad_norm": 8.551374435424805, + "learning_rate": 1.8265909400596133e-05, + "loss": 2.7067, + "step": 6125000 + }, + { + "epoch": 1.9042008682447187, + "grad_norm": 11.271453857421875, + "learning_rate": 1.826331886258802e-05, + "loss": 2.7762, + "step": 6125500 + }, + { + "epoch": 1.9043563005252055, + "grad_norm": 11.194074630737305, + "learning_rate": 1.8260728324579908e-05, + "loss": 2.7058, + "step": 6126000 + }, + { + "epoch": 1.9045117328056924, + "grad_norm": 13.982295036315918, + "learning_rate": 1.825813778657179e-05, + "loss": 2.7226, + "step": 6126500 + }, + { + "epoch": 1.9046671650861793, + "grad_norm": 10.240300178527832, + "learning_rate": 1.8255547248563675e-05, + "loss": 2.7321, + "step": 6127000 + }, + { + "epoch": 1.9048225973666661, + "grad_norm": 8.249021530151367, + "learning_rate": 1.8252956710555562e-05, + "loss": 2.7231, + "step": 6127500 + }, + { + "epoch": 1.904978029647153, + "grad_norm": 10.862166404724121, + "learning_rate": 1.8250366172547446e-05, + "loss": 2.7049, + "step": 6128000 + }, + { + "epoch": 1.90513346192764, + "grad_norm": 10.152064323425293, + "learning_rate": 1.8247775634539333e-05, + "loss": 2.6812, + "step": 6128500 + }, + { + "epoch": 1.905288894208127, + "grad_norm": 10.707205772399902, + "learning_rate": 1.824518509653122e-05, + "loss": 2.66, + "step": 6129000 + }, + { + "epoch": 1.9054443264886138, + "grad_norm": 14.845879554748535, + "learning_rate": 1.8242594558523104e-05, + "loss": 2.7006, + "step": 6129500 + }, + { + "epoch": 1.9055997587691007, + "grad_norm": 8.115496635437012, + "learning_rate": 1.8240004020514988e-05, + "loss": 2.6841, + "step": 6130000 + }, + { + "epoch": 1.9057551910495876, + "grad_norm": 10.62258243560791, + "learning_rate": 1.8237413482506875e-05, + "loss": 2.7237, + "step": 6130500 + }, + { + "epoch": 1.9059106233300744, + "grad_norm": 10.730562210083008, + "learning_rate": 1.8234822944498762e-05, + "loss": 2.6897, + "step": 6131000 + }, + { + "epoch": 1.9060660556105613, + "grad_norm": 9.531167984008789, + "learning_rate": 1.8232232406490646e-05, + "loss": 2.7019, + "step": 6131500 + }, + { + "epoch": 1.9062214878910482, + "grad_norm": 9.73584270477295, + "learning_rate": 1.822964186848253e-05, + "loss": 2.711, + "step": 6132000 + }, + { + "epoch": 1.906376920171535, + "grad_norm": 9.167436599731445, + "learning_rate": 1.8227051330474417e-05, + "loss": 2.6768, + "step": 6132500 + }, + { + "epoch": 1.906532352452022, + "grad_norm": 9.175935745239258, + "learning_rate": 1.82244607924663e-05, + "loss": 2.7372, + "step": 6133000 + }, + { + "epoch": 1.9066877847325088, + "grad_norm": 10.711585998535156, + "learning_rate": 1.8221870254458188e-05, + "loss": 2.7292, + "step": 6133500 + }, + { + "epoch": 1.9068432170129956, + "grad_norm": 9.865829467773438, + "learning_rate": 1.8219279716450072e-05, + "loss": 2.6979, + "step": 6134000 + }, + { + "epoch": 1.9069986492934827, + "grad_norm": 10.409299850463867, + "learning_rate": 1.821668917844196e-05, + "loss": 2.68, + "step": 6134500 + }, + { + "epoch": 1.9071540815739696, + "grad_norm": 14.536827087402344, + "learning_rate": 1.8214098640433843e-05, + "loss": 2.6911, + "step": 6135000 + }, + { + "epoch": 1.9073095138544565, + "grad_norm": 10.6654052734375, + "learning_rate": 1.821150810242573e-05, + "loss": 2.7731, + "step": 6135500 + }, + { + "epoch": 1.9074649461349433, + "grad_norm": 9.315185546875, + "learning_rate": 1.8208917564417614e-05, + "loss": 2.6769, + "step": 6136000 + }, + { + "epoch": 1.9076203784154302, + "grad_norm": 8.580126762390137, + "learning_rate": 1.82063270264095e-05, + "loss": 2.6981, + "step": 6136500 + }, + { + "epoch": 1.907775810695917, + "grad_norm": 9.009601593017578, + "learning_rate": 1.8203736488401384e-05, + "loss": 2.6758, + "step": 6137000 + }, + { + "epoch": 1.907931242976404, + "grad_norm": 9.184388160705566, + "learning_rate": 1.8201145950393268e-05, + "loss": 2.7776, + "step": 6137500 + }, + { + "epoch": 1.9080866752568908, + "grad_norm": 12.113730430603027, + "learning_rate": 1.8198555412385155e-05, + "loss": 2.6678, + "step": 6138000 + }, + { + "epoch": 1.9082421075373777, + "grad_norm": 9.923501014709473, + "learning_rate": 1.8195964874377043e-05, + "loss": 2.6909, + "step": 6138500 + }, + { + "epoch": 1.9083975398178645, + "grad_norm": 12.989973068237305, + "learning_rate": 1.8193374336368926e-05, + "loss": 2.6851, + "step": 6139000 + }, + { + "epoch": 1.9085529720983514, + "grad_norm": 6.367912292480469, + "learning_rate": 1.819078379836081e-05, + "loss": 2.7134, + "step": 6139500 + }, + { + "epoch": 1.9087084043788383, + "grad_norm": 17.129701614379883, + "learning_rate": 1.8188193260352697e-05, + "loss": 2.6903, + "step": 6140000 + }, + { + "epoch": 1.9088638366593251, + "grad_norm": 9.92507553100586, + "learning_rate": 1.8185602722344584e-05, + "loss": 2.7185, + "step": 6140500 + }, + { + "epoch": 1.909019268939812, + "grad_norm": 8.806078910827637, + "learning_rate": 1.8183012184336468e-05, + "loss": 2.7141, + "step": 6141000 + }, + { + "epoch": 1.9091747012202989, + "grad_norm": 9.805106163024902, + "learning_rate": 1.8180421646328355e-05, + "loss": 2.7339, + "step": 6141500 + }, + { + "epoch": 1.9093301335007857, + "grad_norm": 9.159728050231934, + "learning_rate": 1.817783110832024e-05, + "loss": 2.7446, + "step": 6142000 + }, + { + "epoch": 1.9094855657812726, + "grad_norm": 8.947589874267578, + "learning_rate": 1.8175240570312123e-05, + "loss": 2.6834, + "step": 6142500 + }, + { + "epoch": 1.9096409980617595, + "grad_norm": 8.79433536529541, + "learning_rate": 1.817265003230401e-05, + "loss": 2.7107, + "step": 6143000 + }, + { + "epoch": 1.9097964303422463, + "grad_norm": 9.457640647888184, + "learning_rate": 1.8170059494295897e-05, + "loss": 2.6992, + "step": 6143500 + }, + { + "epoch": 1.9099518626227332, + "grad_norm": 9.77224349975586, + "learning_rate": 1.816746895628778e-05, + "loss": 2.7289, + "step": 6144000 + }, + { + "epoch": 1.91010729490322, + "grad_norm": 20.31692123413086, + "learning_rate": 1.8164878418279665e-05, + "loss": 2.677, + "step": 6144500 + }, + { + "epoch": 1.910262727183707, + "grad_norm": 9.811423301696777, + "learning_rate": 1.8162287880271552e-05, + "loss": 2.7374, + "step": 6145000 + }, + { + "epoch": 1.9104181594641938, + "grad_norm": 9.697639465332031, + "learning_rate": 1.815969734226344e-05, + "loss": 2.767, + "step": 6145500 + }, + { + "epoch": 1.9105735917446807, + "grad_norm": 12.549519538879395, + "learning_rate": 1.8157106804255323e-05, + "loss": 2.7473, + "step": 6146000 + }, + { + "epoch": 1.9107290240251675, + "grad_norm": 16.147850036621094, + "learning_rate": 1.8154516266247206e-05, + "loss": 2.6469, + "step": 6146500 + }, + { + "epoch": 1.9108844563056544, + "grad_norm": 8.527677536010742, + "learning_rate": 1.8151925728239094e-05, + "loss": 2.7816, + "step": 6147000 + }, + { + "epoch": 1.9110398885861413, + "grad_norm": 13.854696273803711, + "learning_rate": 1.8149335190230977e-05, + "loss": 2.7117, + "step": 6147500 + }, + { + "epoch": 1.9111953208666281, + "grad_norm": 39.55266189575195, + "learning_rate": 1.8146744652222865e-05, + "loss": 2.705, + "step": 6148000 + }, + { + "epoch": 1.911350753147115, + "grad_norm": 10.395376205444336, + "learning_rate": 1.814415411421475e-05, + "loss": 2.7114, + "step": 6148500 + }, + { + "epoch": 1.9115061854276019, + "grad_norm": 8.251696586608887, + "learning_rate": 1.8141563576206635e-05, + "loss": 2.6936, + "step": 6149000 + }, + { + "epoch": 1.9116616177080887, + "grad_norm": 8.121703147888184, + "learning_rate": 1.813897303819852e-05, + "loss": 2.71, + "step": 6149500 + }, + { + "epoch": 1.9118170499885756, + "grad_norm": 11.84521198272705, + "learning_rate": 1.8136382500190406e-05, + "loss": 2.6978, + "step": 6150000 + }, + { + "epoch": 1.9119724822690625, + "grad_norm": 10.7095308303833, + "learning_rate": 1.8133791962182294e-05, + "loss": 2.735, + "step": 6150500 + }, + { + "epoch": 1.9121279145495493, + "grad_norm": 10.25289249420166, + "learning_rate": 1.8131201424174177e-05, + "loss": 2.7231, + "step": 6151000 + }, + { + "epoch": 1.9122833468300362, + "grad_norm": 14.038287162780762, + "learning_rate": 1.812861088616606e-05, + "loss": 2.7415, + "step": 6151500 + }, + { + "epoch": 1.912438779110523, + "grad_norm": 8.37179183959961, + "learning_rate": 1.8126020348157945e-05, + "loss": 2.7223, + "step": 6152000 + }, + { + "epoch": 1.9125942113910102, + "grad_norm": 11.273056983947754, + "learning_rate": 1.8123429810149832e-05, + "loss": 2.7114, + "step": 6152500 + }, + { + "epoch": 1.912749643671497, + "grad_norm": 8.24425220489502, + "learning_rate": 1.812083927214172e-05, + "loss": 2.6822, + "step": 6153000 + }, + { + "epoch": 1.912905075951984, + "grad_norm": 8.40710735321045, + "learning_rate": 1.8118248734133603e-05, + "loss": 2.6967, + "step": 6153500 + }, + { + "epoch": 1.9130605082324708, + "grad_norm": 9.805397987365723, + "learning_rate": 1.8115658196125487e-05, + "loss": 2.709, + "step": 6154000 + }, + { + "epoch": 1.9132159405129576, + "grad_norm": 13.282853126525879, + "learning_rate": 1.8113067658117374e-05, + "loss": 2.678, + "step": 6154500 + }, + { + "epoch": 1.9133713727934445, + "grad_norm": 11.254938125610352, + "learning_rate": 1.811047712010926e-05, + "loss": 2.6718, + "step": 6155000 + }, + { + "epoch": 1.9135268050739314, + "grad_norm": 13.445643424987793, + "learning_rate": 1.8107886582101145e-05, + "loss": 2.7101, + "step": 6155500 + }, + { + "epoch": 1.9136822373544182, + "grad_norm": 8.7559175491333, + "learning_rate": 1.8105296044093032e-05, + "loss": 2.7049, + "step": 6156000 + }, + { + "epoch": 1.913837669634905, + "grad_norm": 31.85071563720703, + "learning_rate": 1.8102705506084916e-05, + "loss": 2.7035, + "step": 6156500 + }, + { + "epoch": 1.913993101915392, + "grad_norm": 10.356575012207031, + "learning_rate": 1.81001149680768e-05, + "loss": 2.6925, + "step": 6157000 + }, + { + "epoch": 1.9141485341958788, + "grad_norm": 11.16580581665039, + "learning_rate": 1.8097524430068687e-05, + "loss": 2.6096, + "step": 6157500 + }, + { + "epoch": 1.9143039664763657, + "grad_norm": 11.385037422180176, + "learning_rate": 1.8094933892060574e-05, + "loss": 2.7136, + "step": 6158000 + }, + { + "epoch": 1.9144593987568528, + "grad_norm": 11.689997673034668, + "learning_rate": 1.8092343354052458e-05, + "loss": 2.7189, + "step": 6158500 + }, + { + "epoch": 1.9146148310373396, + "grad_norm": 9.19793701171875, + "learning_rate": 1.808975281604434e-05, + "loss": 2.7473, + "step": 6159000 + }, + { + "epoch": 1.9147702633178265, + "grad_norm": 11.352249145507812, + "learning_rate": 1.808716227803623e-05, + "loss": 2.7348, + "step": 6159500 + }, + { + "epoch": 1.9149256955983134, + "grad_norm": 11.271684646606445, + "learning_rate": 1.8084571740028116e-05, + "loss": 2.693, + "step": 6160000 + }, + { + "epoch": 1.9150811278788002, + "grad_norm": 9.717806816101074, + "learning_rate": 1.808198120202e-05, + "loss": 2.6752, + "step": 6160500 + }, + { + "epoch": 1.9152365601592871, + "grad_norm": 10.550542831420898, + "learning_rate": 1.8079390664011883e-05, + "loss": 2.6505, + "step": 6161000 + }, + { + "epoch": 1.915391992439774, + "grad_norm": 10.28568172454834, + "learning_rate": 1.807680012600377e-05, + "loss": 2.7311, + "step": 6161500 + }, + { + "epoch": 1.9155474247202608, + "grad_norm": 6.375062465667725, + "learning_rate": 1.8074209587995654e-05, + "loss": 2.7298, + "step": 6162000 + }, + { + "epoch": 1.9157028570007477, + "grad_norm": 9.555851936340332, + "learning_rate": 1.807161904998754e-05, + "loss": 2.7118, + "step": 6162500 + }, + { + "epoch": 1.9158582892812346, + "grad_norm": 10.543843269348145, + "learning_rate": 1.8069028511979425e-05, + "loss": 2.6832, + "step": 6163000 + }, + { + "epoch": 1.9160137215617215, + "grad_norm": 12.365208625793457, + "learning_rate": 1.8066437973971312e-05, + "loss": 2.7163, + "step": 6163500 + }, + { + "epoch": 1.9161691538422083, + "grad_norm": 9.461843490600586, + "learning_rate": 1.8063847435963196e-05, + "loss": 2.6826, + "step": 6164000 + }, + { + "epoch": 1.9163245861226952, + "grad_norm": 9.133349418640137, + "learning_rate": 1.806125689795508e-05, + "loss": 2.7135, + "step": 6164500 + }, + { + "epoch": 1.916480018403182, + "grad_norm": 8.165904998779297, + "learning_rate": 1.805866635994697e-05, + "loss": 2.7355, + "step": 6165000 + }, + { + "epoch": 1.916635450683669, + "grad_norm": 8.060606956481934, + "learning_rate": 1.8056075821938854e-05, + "loss": 2.7365, + "step": 6165500 + }, + { + "epoch": 1.9167908829641558, + "grad_norm": 9.650263786315918, + "learning_rate": 1.8053485283930738e-05, + "loss": 2.6854, + "step": 6166000 + }, + { + "epoch": 1.9169463152446427, + "grad_norm": 14.810920715332031, + "learning_rate": 1.805089474592262e-05, + "loss": 2.725, + "step": 6166500 + }, + { + "epoch": 1.9171017475251295, + "grad_norm": 9.71377182006836, + "learning_rate": 1.804830420791451e-05, + "loss": 2.6925, + "step": 6167000 + }, + { + "epoch": 1.9172571798056164, + "grad_norm": 11.724621772766113, + "learning_rate": 1.8045713669906396e-05, + "loss": 2.6627, + "step": 6167500 + }, + { + "epoch": 1.9174126120861033, + "grad_norm": 13.911215782165527, + "learning_rate": 1.804312313189828e-05, + "loss": 2.7375, + "step": 6168000 + }, + { + "epoch": 1.9175680443665901, + "grad_norm": 10.437297821044922, + "learning_rate": 1.8040532593890167e-05, + "loss": 2.7497, + "step": 6168500 + }, + { + "epoch": 1.917723476647077, + "grad_norm": 17.910110473632812, + "learning_rate": 1.803794205588205e-05, + "loss": 2.7243, + "step": 6169000 + }, + { + "epoch": 1.9178789089275639, + "grad_norm": 9.900843620300293, + "learning_rate": 1.8035351517873934e-05, + "loss": 2.6681, + "step": 6169500 + }, + { + "epoch": 1.9180343412080507, + "grad_norm": 8.050731658935547, + "learning_rate": 1.803276097986582e-05, + "loss": 2.6479, + "step": 6170000 + }, + { + "epoch": 1.9181897734885376, + "grad_norm": 10.742048263549805, + "learning_rate": 1.803017044185771e-05, + "loss": 2.6678, + "step": 6170500 + }, + { + "epoch": 1.9183452057690245, + "grad_norm": 15.575956344604492, + "learning_rate": 1.8027579903849592e-05, + "loss": 2.7243, + "step": 6171000 + }, + { + "epoch": 1.9185006380495113, + "grad_norm": 9.037358283996582, + "learning_rate": 1.8024989365841476e-05, + "loss": 2.7423, + "step": 6171500 + }, + { + "epoch": 1.9186560703299982, + "grad_norm": 38.038692474365234, + "learning_rate": 1.8022398827833363e-05, + "loss": 2.6988, + "step": 6172000 + }, + { + "epoch": 1.918811502610485, + "grad_norm": 11.902853965759277, + "learning_rate": 1.801980828982525e-05, + "loss": 2.7604, + "step": 6172500 + }, + { + "epoch": 1.918966934890972, + "grad_norm": 6.965473651885986, + "learning_rate": 1.8017217751817134e-05, + "loss": 2.7121, + "step": 6173000 + }, + { + "epoch": 1.9191223671714588, + "grad_norm": 10.538763999938965, + "learning_rate": 1.8014627213809018e-05, + "loss": 2.7206, + "step": 6173500 + }, + { + "epoch": 1.9192777994519457, + "grad_norm": 21.69645118713379, + "learning_rate": 1.8012036675800905e-05, + "loss": 2.7301, + "step": 6174000 + }, + { + "epoch": 1.9194332317324325, + "grad_norm": 8.2157621383667, + "learning_rate": 1.800944613779279e-05, + "loss": 2.7354, + "step": 6174500 + }, + { + "epoch": 1.9195886640129194, + "grad_norm": 9.772622108459473, + "learning_rate": 1.8006855599784676e-05, + "loss": 2.7067, + "step": 6175000 + }, + { + "epoch": 1.9197440962934063, + "grad_norm": 42.181114196777344, + "learning_rate": 1.800426506177656e-05, + "loss": 2.6622, + "step": 6175500 + }, + { + "epoch": 1.9198995285738931, + "grad_norm": 13.63506031036377, + "learning_rate": 1.8001674523768447e-05, + "loss": 2.7241, + "step": 6176000 + }, + { + "epoch": 1.9200549608543802, + "grad_norm": 12.47251033782959, + "learning_rate": 1.799908398576033e-05, + "loss": 2.7541, + "step": 6176500 + }, + { + "epoch": 1.920210393134867, + "grad_norm": 12.135584831237793, + "learning_rate": 1.7996493447752218e-05, + "loss": 2.677, + "step": 6177000 + }, + { + "epoch": 1.920365825415354, + "grad_norm": 7.983353137969971, + "learning_rate": 1.7993902909744105e-05, + "loss": 2.7142, + "step": 6177500 + }, + { + "epoch": 1.9205212576958408, + "grad_norm": 9.782539367675781, + "learning_rate": 1.799131237173599e-05, + "loss": 2.7316, + "step": 6178000 + }, + { + "epoch": 1.9206766899763277, + "grad_norm": 11.499107360839844, + "learning_rate": 1.7988721833727872e-05, + "loss": 2.7192, + "step": 6178500 + }, + { + "epoch": 1.9208321222568145, + "grad_norm": 9.588883399963379, + "learning_rate": 1.7986131295719756e-05, + "loss": 2.6567, + "step": 6179000 + }, + { + "epoch": 1.9209875545373014, + "grad_norm": 9.548892974853516, + "learning_rate": 1.7983540757711643e-05, + "loss": 2.7389, + "step": 6179500 + }, + { + "epoch": 1.9211429868177883, + "grad_norm": 7.722056865692139, + "learning_rate": 1.798095021970353e-05, + "loss": 2.7147, + "step": 6180000 + }, + { + "epoch": 1.9212984190982751, + "grad_norm": 9.759925842285156, + "learning_rate": 1.7978359681695414e-05, + "loss": 2.7227, + "step": 6180500 + }, + { + "epoch": 1.921453851378762, + "grad_norm": 12.234682083129883, + "learning_rate": 1.7975769143687298e-05, + "loss": 2.7103, + "step": 6181000 + }, + { + "epoch": 1.9216092836592489, + "grad_norm": 13.711362838745117, + "learning_rate": 1.7973178605679185e-05, + "loss": 2.7315, + "step": 6181500 + }, + { + "epoch": 1.9217647159397357, + "grad_norm": 10.317355155944824, + "learning_rate": 1.7970588067671072e-05, + "loss": 2.6928, + "step": 6182000 + }, + { + "epoch": 1.9219201482202228, + "grad_norm": 10.547563552856445, + "learning_rate": 1.7967997529662956e-05, + "loss": 2.704, + "step": 6182500 + }, + { + "epoch": 1.9220755805007097, + "grad_norm": 11.8016996383667, + "learning_rate": 1.7965406991654843e-05, + "loss": 2.6733, + "step": 6183000 + }, + { + "epoch": 1.9222310127811966, + "grad_norm": 35.717891693115234, + "learning_rate": 1.7962816453646727e-05, + "loss": 2.7205, + "step": 6183500 + }, + { + "epoch": 1.9223864450616834, + "grad_norm": 11.073837280273438, + "learning_rate": 1.796022591563861e-05, + "loss": 2.7286, + "step": 6184000 + }, + { + "epoch": 1.9225418773421703, + "grad_norm": 12.049833297729492, + "learning_rate": 1.7957635377630498e-05, + "loss": 2.7668, + "step": 6184500 + }, + { + "epoch": 1.9226973096226572, + "grad_norm": 35.81043243408203, + "learning_rate": 1.7955044839622385e-05, + "loss": 2.6832, + "step": 6185000 + }, + { + "epoch": 1.922852741903144, + "grad_norm": 8.28368091583252, + "learning_rate": 1.795245430161427e-05, + "loss": 2.7021, + "step": 6185500 + }, + { + "epoch": 1.923008174183631, + "grad_norm": 10.161111831665039, + "learning_rate": 1.7949863763606153e-05, + "loss": 2.6916, + "step": 6186000 + }, + { + "epoch": 1.9231636064641178, + "grad_norm": 9.273707389831543, + "learning_rate": 1.794727322559804e-05, + "loss": 2.6724, + "step": 6186500 + }, + { + "epoch": 1.9233190387446046, + "grad_norm": 11.767848014831543, + "learning_rate": 1.7944682687589927e-05, + "loss": 2.7078, + "step": 6187000 + }, + { + "epoch": 1.9234744710250915, + "grad_norm": 5.917243957519531, + "learning_rate": 1.794209214958181e-05, + "loss": 2.6862, + "step": 6187500 + }, + { + "epoch": 1.9236299033055784, + "grad_norm": 11.3074951171875, + "learning_rate": 1.7939501611573695e-05, + "loss": 2.7034, + "step": 6188000 + }, + { + "epoch": 1.9237853355860652, + "grad_norm": 9.367467880249023, + "learning_rate": 1.793691107356558e-05, + "loss": 2.7265, + "step": 6188500 + }, + { + "epoch": 1.923940767866552, + "grad_norm": 5.351495265960693, + "learning_rate": 1.7934320535557465e-05, + "loss": 2.6463, + "step": 6189000 + }, + { + "epoch": 1.924096200147039, + "grad_norm": 7.607372283935547, + "learning_rate": 1.7931729997549353e-05, + "loss": 2.6805, + "step": 6189500 + }, + { + "epoch": 1.9242516324275258, + "grad_norm": 8.349042892456055, + "learning_rate": 1.7929139459541236e-05, + "loss": 2.7319, + "step": 6190000 + }, + { + "epoch": 1.9244070647080127, + "grad_norm": 8.268037796020508, + "learning_rate": 1.7926548921533124e-05, + "loss": 2.7129, + "step": 6190500 + }, + { + "epoch": 1.9245624969884996, + "grad_norm": 11.817957878112793, + "learning_rate": 1.7923958383525007e-05, + "loss": 2.6801, + "step": 6191000 + }, + { + "epoch": 1.9247179292689864, + "grad_norm": 10.20536994934082, + "learning_rate": 1.792136784551689e-05, + "loss": 2.7199, + "step": 6191500 + }, + { + "epoch": 1.9248733615494733, + "grad_norm": 8.489242553710938, + "learning_rate": 1.791877730750878e-05, + "loss": 2.7091, + "step": 6192000 + }, + { + "epoch": 1.9250287938299602, + "grad_norm": 12.511687278747559, + "learning_rate": 1.7916186769500665e-05, + "loss": 2.6879, + "step": 6192500 + }, + { + "epoch": 1.925184226110447, + "grad_norm": 10.56583023071289, + "learning_rate": 1.791359623149255e-05, + "loss": 2.6789, + "step": 6193000 + }, + { + "epoch": 1.925339658390934, + "grad_norm": 10.893620491027832, + "learning_rate": 1.7911005693484433e-05, + "loss": 2.7026, + "step": 6193500 + }, + { + "epoch": 1.9254950906714208, + "grad_norm": 9.481050491333008, + "learning_rate": 1.790841515547632e-05, + "loss": 2.7325, + "step": 6194000 + }, + { + "epoch": 1.9256505229519076, + "grad_norm": 8.249889373779297, + "learning_rate": 1.7905824617468207e-05, + "loss": 2.6563, + "step": 6194500 + }, + { + "epoch": 1.9258059552323945, + "grad_norm": 9.350749015808105, + "learning_rate": 1.790323407946009e-05, + "loss": 2.6875, + "step": 6195000 + }, + { + "epoch": 1.9259613875128814, + "grad_norm": 7.619637966156006, + "learning_rate": 1.7900643541451978e-05, + "loss": 2.6985, + "step": 6195500 + }, + { + "epoch": 1.9261168197933682, + "grad_norm": 12.689906120300293, + "learning_rate": 1.7898053003443862e-05, + "loss": 2.6815, + "step": 6196000 + }, + { + "epoch": 1.926272252073855, + "grad_norm": 7.945117950439453, + "learning_rate": 1.7895462465435746e-05, + "loss": 2.6995, + "step": 6196500 + }, + { + "epoch": 1.926427684354342, + "grad_norm": 12.5842924118042, + "learning_rate": 1.7892871927427633e-05, + "loss": 2.7099, + "step": 6197000 + }, + { + "epoch": 1.9265831166348288, + "grad_norm": 11.607348442077637, + "learning_rate": 1.789028138941952e-05, + "loss": 2.672, + "step": 6197500 + }, + { + "epoch": 1.9267385489153157, + "grad_norm": 8.997671127319336, + "learning_rate": 1.7887690851411404e-05, + "loss": 2.7084, + "step": 6198000 + }, + { + "epoch": 1.9268939811958026, + "grad_norm": 14.713113784790039, + "learning_rate": 1.7885100313403287e-05, + "loss": 2.6778, + "step": 6198500 + }, + { + "epoch": 1.9270494134762894, + "grad_norm": 9.969120025634766, + "learning_rate": 1.7882509775395175e-05, + "loss": 2.6803, + "step": 6199000 + }, + { + "epoch": 1.9272048457567763, + "grad_norm": 9.574060440063477, + "learning_rate": 1.7879919237387062e-05, + "loss": 2.6636, + "step": 6199500 + }, + { + "epoch": 1.9273602780372632, + "grad_norm": 10.881086349487305, + "learning_rate": 1.7877328699378946e-05, + "loss": 2.6866, + "step": 6200000 + }, + { + "epoch": 1.9275157103177503, + "grad_norm": 9.6753511428833, + "learning_rate": 1.787473816137083e-05, + "loss": 2.705, + "step": 6200500 + }, + { + "epoch": 1.9276711425982371, + "grad_norm": 8.109087944030762, + "learning_rate": 1.7872147623362716e-05, + "loss": 2.7191, + "step": 6201000 + }, + { + "epoch": 1.927826574878724, + "grad_norm": 9.331068992614746, + "learning_rate": 1.78695570853546e-05, + "loss": 2.7125, + "step": 6201500 + }, + { + "epoch": 1.9279820071592109, + "grad_norm": 10.683977127075195, + "learning_rate": 1.7866966547346487e-05, + "loss": 2.685, + "step": 6202000 + }, + { + "epoch": 1.9281374394396977, + "grad_norm": 11.199057579040527, + "learning_rate": 1.786437600933837e-05, + "loss": 2.709, + "step": 6202500 + }, + { + "epoch": 1.9282928717201846, + "grad_norm": 9.413895606994629, + "learning_rate": 1.7861785471330258e-05, + "loss": 2.6732, + "step": 6203000 + }, + { + "epoch": 1.9284483040006715, + "grad_norm": 14.704536437988281, + "learning_rate": 1.7859194933322142e-05, + "loss": 2.685, + "step": 6203500 + }, + { + "epoch": 1.9286037362811583, + "grad_norm": 9.289810180664062, + "learning_rate": 1.785660439531403e-05, + "loss": 2.7406, + "step": 6204000 + }, + { + "epoch": 1.9287591685616452, + "grad_norm": 16.68156623840332, + "learning_rate": 1.7854013857305916e-05, + "loss": 2.6831, + "step": 6204500 + }, + { + "epoch": 1.928914600842132, + "grad_norm": 11.383519172668457, + "learning_rate": 1.78514233192978e-05, + "loss": 2.7041, + "step": 6205000 + }, + { + "epoch": 1.929070033122619, + "grad_norm": 5.713280200958252, + "learning_rate": 1.7848832781289684e-05, + "loss": 2.7106, + "step": 6205500 + }, + { + "epoch": 1.9292254654031058, + "grad_norm": 9.833961486816406, + "learning_rate": 1.7846242243281568e-05, + "loss": 2.7158, + "step": 6206000 + }, + { + "epoch": 1.9293808976835929, + "grad_norm": 8.297346115112305, + "learning_rate": 1.7843651705273455e-05, + "loss": 2.6824, + "step": 6206500 + }, + { + "epoch": 1.9295363299640798, + "grad_norm": 11.791733741760254, + "learning_rate": 1.7841061167265342e-05, + "loss": 2.6695, + "step": 6207000 + }, + { + "epoch": 1.9296917622445666, + "grad_norm": 10.168438911437988, + "learning_rate": 1.7838470629257226e-05, + "loss": 2.7059, + "step": 6207500 + }, + { + "epoch": 1.9298471945250535, + "grad_norm": 9.26523208618164, + "learning_rate": 1.783588009124911e-05, + "loss": 2.6945, + "step": 6208000 + }, + { + "epoch": 1.9300026268055404, + "grad_norm": 8.316652297973633, + "learning_rate": 1.7833289553240997e-05, + "loss": 2.7471, + "step": 6208500 + }, + { + "epoch": 1.9301580590860272, + "grad_norm": 8.70187759399414, + "learning_rate": 1.7830699015232884e-05, + "loss": 2.7125, + "step": 6209000 + }, + { + "epoch": 1.930313491366514, + "grad_norm": 9.277782440185547, + "learning_rate": 1.7828108477224768e-05, + "loss": 2.6758, + "step": 6209500 + }, + { + "epoch": 1.930468923647001, + "grad_norm": 12.539257049560547, + "learning_rate": 1.7825517939216655e-05, + "loss": 2.7286, + "step": 6210000 + }, + { + "epoch": 1.9306243559274878, + "grad_norm": 13.1458158493042, + "learning_rate": 1.782292740120854e-05, + "loss": 2.7073, + "step": 6210500 + }, + { + "epoch": 1.9307797882079747, + "grad_norm": 10.568598747253418, + "learning_rate": 1.7820336863200422e-05, + "loss": 2.7121, + "step": 6211000 + }, + { + "epoch": 1.9309352204884616, + "grad_norm": 8.765340805053711, + "learning_rate": 1.781774632519231e-05, + "loss": 2.7332, + "step": 6211500 + }, + { + "epoch": 1.9310906527689484, + "grad_norm": 10.486903190612793, + "learning_rate": 1.7815155787184197e-05, + "loss": 2.6955, + "step": 6212000 + }, + { + "epoch": 1.9312460850494353, + "grad_norm": 8.380946159362793, + "learning_rate": 1.781256524917608e-05, + "loss": 2.6776, + "step": 6212500 + }, + { + "epoch": 1.9314015173299222, + "grad_norm": 10.08552074432373, + "learning_rate": 1.7809974711167964e-05, + "loss": 2.7259, + "step": 6213000 + }, + { + "epoch": 1.931556949610409, + "grad_norm": 7.827059745788574, + "learning_rate": 1.780738417315985e-05, + "loss": 2.7274, + "step": 6213500 + }, + { + "epoch": 1.931712381890896, + "grad_norm": 8.024733543395996, + "learning_rate": 1.780479363515174e-05, + "loss": 2.6765, + "step": 6214000 + }, + { + "epoch": 1.9318678141713828, + "grad_norm": 9.164281845092773, + "learning_rate": 1.7802203097143622e-05, + "loss": 2.6945, + "step": 6214500 + }, + { + "epoch": 1.9320232464518696, + "grad_norm": 8.643006324768066, + "learning_rate": 1.7799612559135506e-05, + "loss": 2.695, + "step": 6215000 + }, + { + "epoch": 1.9321786787323565, + "grad_norm": 10.581880569458008, + "learning_rate": 1.7797022021127393e-05, + "loss": 2.6868, + "step": 6215500 + }, + { + "epoch": 1.9323341110128434, + "grad_norm": 10.387292861938477, + "learning_rate": 1.7794431483119277e-05, + "loss": 2.7011, + "step": 6216000 + }, + { + "epoch": 1.9324895432933302, + "grad_norm": 9.523636817932129, + "learning_rate": 1.7791840945111164e-05, + "loss": 2.7283, + "step": 6216500 + }, + { + "epoch": 1.932644975573817, + "grad_norm": 10.875844955444336, + "learning_rate": 1.7789250407103048e-05, + "loss": 2.7566, + "step": 6217000 + }, + { + "epoch": 1.932800407854304, + "grad_norm": 8.496195793151855, + "learning_rate": 1.7786659869094935e-05, + "loss": 2.6868, + "step": 6217500 + }, + { + "epoch": 1.9329558401347908, + "grad_norm": 52.0619010925293, + "learning_rate": 1.778406933108682e-05, + "loss": 2.6924, + "step": 6218000 + }, + { + "epoch": 1.9331112724152777, + "grad_norm": 8.549470901489258, + "learning_rate": 1.7781478793078702e-05, + "loss": 2.6912, + "step": 6218500 + }, + { + "epoch": 1.9332667046957646, + "grad_norm": 9.723100662231445, + "learning_rate": 1.7778888255070593e-05, + "loss": 2.7302, + "step": 6219000 + }, + { + "epoch": 1.9334221369762514, + "grad_norm": 8.918662071228027, + "learning_rate": 1.7776297717062477e-05, + "loss": 2.6646, + "step": 6219500 + }, + { + "epoch": 1.9335775692567383, + "grad_norm": 10.067307472229004, + "learning_rate": 1.777370717905436e-05, + "loss": 2.7079, + "step": 6220000 + }, + { + "epoch": 1.9337330015372252, + "grad_norm": 12.992559432983398, + "learning_rate": 1.7771116641046244e-05, + "loss": 2.7533, + "step": 6220500 + }, + { + "epoch": 1.933888433817712, + "grad_norm": 9.367100715637207, + "learning_rate": 1.776852610303813e-05, + "loss": 2.7371, + "step": 6221000 + }, + { + "epoch": 1.934043866098199, + "grad_norm": 11.327390670776367, + "learning_rate": 1.776593556503002e-05, + "loss": 2.7161, + "step": 6221500 + }, + { + "epoch": 1.9341992983786858, + "grad_norm": 9.60624885559082, + "learning_rate": 1.7763345027021902e-05, + "loss": 2.7282, + "step": 6222000 + }, + { + "epoch": 1.9343547306591726, + "grad_norm": 26.997631072998047, + "learning_rate": 1.776075448901379e-05, + "loss": 2.7287, + "step": 6222500 + }, + { + "epoch": 1.9345101629396595, + "grad_norm": 9.850542068481445, + "learning_rate": 1.7758163951005673e-05, + "loss": 2.6458, + "step": 6223000 + }, + { + "epoch": 1.9346655952201464, + "grad_norm": 9.553865432739258, + "learning_rate": 1.7755573412997557e-05, + "loss": 2.6974, + "step": 6223500 + }, + { + "epoch": 1.9348210275006332, + "grad_norm": 9.250406265258789, + "learning_rate": 1.7752982874989444e-05, + "loss": 2.6792, + "step": 6224000 + }, + { + "epoch": 1.9349764597811203, + "grad_norm": 12.36746597290039, + "learning_rate": 1.775039233698133e-05, + "loss": 2.691, + "step": 6224500 + }, + { + "epoch": 1.9351318920616072, + "grad_norm": 31.88622283935547, + "learning_rate": 1.7747801798973215e-05, + "loss": 2.7404, + "step": 6225000 + }, + { + "epoch": 1.935287324342094, + "grad_norm": 10.459936141967773, + "learning_rate": 1.77452112609651e-05, + "loss": 2.6973, + "step": 6225500 + }, + { + "epoch": 1.935442756622581, + "grad_norm": 9.273307800292969, + "learning_rate": 1.7742620722956986e-05, + "loss": 2.6853, + "step": 6226000 + }, + { + "epoch": 1.9355981889030678, + "grad_norm": 7.561222076416016, + "learning_rate": 1.7740030184948873e-05, + "loss": 2.6747, + "step": 6226500 + }, + { + "epoch": 1.9357536211835547, + "grad_norm": 9.541339874267578, + "learning_rate": 1.7737439646940757e-05, + "loss": 2.6329, + "step": 6227000 + }, + { + "epoch": 1.9359090534640415, + "grad_norm": 11.313865661621094, + "learning_rate": 1.773484910893264e-05, + "loss": 2.7302, + "step": 6227500 + }, + { + "epoch": 1.9360644857445284, + "grad_norm": 16.183263778686523, + "learning_rate": 1.7732258570924528e-05, + "loss": 2.6958, + "step": 6228000 + }, + { + "epoch": 1.9362199180250153, + "grad_norm": 14.827424049377441, + "learning_rate": 1.772966803291641e-05, + "loss": 2.6917, + "step": 6228500 + }, + { + "epoch": 1.9363753503055021, + "grad_norm": 12.171751976013184, + "learning_rate": 1.77270774949083e-05, + "loss": 2.7201, + "step": 6229000 + }, + { + "epoch": 1.936530782585989, + "grad_norm": 10.101222038269043, + "learning_rate": 1.7724486956900183e-05, + "loss": 2.6906, + "step": 6229500 + }, + { + "epoch": 1.9366862148664759, + "grad_norm": 21.32511329650879, + "learning_rate": 1.772189641889207e-05, + "loss": 2.7115, + "step": 6230000 + }, + { + "epoch": 1.936841647146963, + "grad_norm": 10.145933151245117, + "learning_rate": 1.7719305880883953e-05, + "loss": 2.7037, + "step": 6230500 + }, + { + "epoch": 1.9369970794274498, + "grad_norm": 8.878311157226562, + "learning_rate": 1.771671534287584e-05, + "loss": 2.6649, + "step": 6231000 + }, + { + "epoch": 1.9371525117079367, + "grad_norm": 8.325425148010254, + "learning_rate": 1.7714124804867728e-05, + "loss": 2.6789, + "step": 6231500 + }, + { + "epoch": 1.9373079439884235, + "grad_norm": 7.8091630935668945, + "learning_rate": 1.771153426685961e-05, + "loss": 2.7078, + "step": 6232000 + }, + { + "epoch": 1.9374633762689104, + "grad_norm": 12.197896957397461, + "learning_rate": 1.7708943728851495e-05, + "loss": 2.6909, + "step": 6232500 + }, + { + "epoch": 1.9376188085493973, + "grad_norm": 9.302379608154297, + "learning_rate": 1.770635319084338e-05, + "loss": 2.6974, + "step": 6233000 + }, + { + "epoch": 1.9377742408298841, + "grad_norm": 10.383015632629395, + "learning_rate": 1.7703762652835266e-05, + "loss": 2.7504, + "step": 6233500 + }, + { + "epoch": 1.937929673110371, + "grad_norm": 10.566961288452148, + "learning_rate": 1.7701172114827153e-05, + "loss": 2.6941, + "step": 6234000 + }, + { + "epoch": 1.9380851053908579, + "grad_norm": 9.089461326599121, + "learning_rate": 1.7698581576819037e-05, + "loss": 2.7165, + "step": 6234500 + }, + { + "epoch": 1.9382405376713447, + "grad_norm": 26.427959442138672, + "learning_rate": 1.769599103881092e-05, + "loss": 2.7758, + "step": 6235000 + }, + { + "epoch": 1.9383959699518316, + "grad_norm": 14.977781295776367, + "learning_rate": 1.7693400500802808e-05, + "loss": 2.7605, + "step": 6235500 + }, + { + "epoch": 1.9385514022323185, + "grad_norm": 10.531403541564941, + "learning_rate": 1.7690809962794695e-05, + "loss": 2.6599, + "step": 6236000 + }, + { + "epoch": 1.9387068345128053, + "grad_norm": 11.498234748840332, + "learning_rate": 1.768821942478658e-05, + "loss": 2.703, + "step": 6236500 + }, + { + "epoch": 1.9388622667932922, + "grad_norm": 9.390480041503906, + "learning_rate": 1.7685628886778466e-05, + "loss": 2.6732, + "step": 6237000 + }, + { + "epoch": 1.939017699073779, + "grad_norm": 7.730078220367432, + "learning_rate": 1.768303834877035e-05, + "loss": 2.7029, + "step": 6237500 + }, + { + "epoch": 1.939173131354266, + "grad_norm": 9.46361255645752, + "learning_rate": 1.7680447810762234e-05, + "loss": 2.703, + "step": 6238000 + }, + { + "epoch": 1.9393285636347528, + "grad_norm": 13.427980422973633, + "learning_rate": 1.767785727275412e-05, + "loss": 2.7157, + "step": 6238500 + }, + { + "epoch": 1.9394839959152397, + "grad_norm": 12.626065254211426, + "learning_rate": 1.7675266734746008e-05, + "loss": 2.7369, + "step": 6239000 + }, + { + "epoch": 1.9396394281957265, + "grad_norm": 8.402342796325684, + "learning_rate": 1.7672676196737892e-05, + "loss": 2.6532, + "step": 6239500 + }, + { + "epoch": 1.9397948604762134, + "grad_norm": 9.327301025390625, + "learning_rate": 1.7670085658729776e-05, + "loss": 2.7018, + "step": 6240000 + }, + { + "epoch": 1.9399502927567003, + "grad_norm": 11.273341178894043, + "learning_rate": 1.7667495120721663e-05, + "loss": 2.6796, + "step": 6240500 + }, + { + "epoch": 1.9401057250371871, + "grad_norm": 10.193717002868652, + "learning_rate": 1.766490458271355e-05, + "loss": 2.7198, + "step": 6241000 + }, + { + "epoch": 1.940261157317674, + "grad_norm": 8.355236053466797, + "learning_rate": 1.7662314044705434e-05, + "loss": 2.7335, + "step": 6241500 + }, + { + "epoch": 1.9404165895981609, + "grad_norm": 8.667320251464844, + "learning_rate": 1.7659723506697317e-05, + "loss": 2.6855, + "step": 6242000 + }, + { + "epoch": 1.9405720218786477, + "grad_norm": 11.730514526367188, + "learning_rate": 1.7657132968689205e-05, + "loss": 2.673, + "step": 6242500 + }, + { + "epoch": 1.9407274541591346, + "grad_norm": 9.451279640197754, + "learning_rate": 1.7654542430681088e-05, + "loss": 2.7311, + "step": 6243000 + }, + { + "epoch": 1.9408828864396215, + "grad_norm": 13.13267993927002, + "learning_rate": 1.7651951892672975e-05, + "loss": 2.7368, + "step": 6243500 + }, + { + "epoch": 1.9410383187201083, + "grad_norm": 11.148510932922363, + "learning_rate": 1.7649361354664863e-05, + "loss": 2.6612, + "step": 6244000 + }, + { + "epoch": 1.9411937510005952, + "grad_norm": 10.5660982131958, + "learning_rate": 1.7646770816656746e-05, + "loss": 2.7332, + "step": 6244500 + }, + { + "epoch": 1.941349183281082, + "grad_norm": 11.041346549987793, + "learning_rate": 1.764418027864863e-05, + "loss": 2.6878, + "step": 6245000 + }, + { + "epoch": 1.941504615561569, + "grad_norm": 12.296360969543457, + "learning_rate": 1.7641589740640517e-05, + "loss": 2.704, + "step": 6245500 + }, + { + "epoch": 1.9416600478420558, + "grad_norm": 9.870649337768555, + "learning_rate": 1.7638999202632404e-05, + "loss": 2.6978, + "step": 6246000 + }, + { + "epoch": 1.9418154801225427, + "grad_norm": 25.203285217285156, + "learning_rate": 1.7636408664624288e-05, + "loss": 2.7123, + "step": 6246500 + }, + { + "epoch": 1.9419709124030295, + "grad_norm": 10.734140396118164, + "learning_rate": 1.7633818126616172e-05, + "loss": 2.694, + "step": 6247000 + }, + { + "epoch": 1.9421263446835164, + "grad_norm": 5.983595848083496, + "learning_rate": 1.7631227588608056e-05, + "loss": 2.6761, + "step": 6247500 + }, + { + "epoch": 1.9422817769640033, + "grad_norm": 10.946915626525879, + "learning_rate": 1.7628637050599943e-05, + "loss": 2.6635, + "step": 6248000 + }, + { + "epoch": 1.9424372092444901, + "grad_norm": 14.519962310791016, + "learning_rate": 1.762604651259183e-05, + "loss": 2.6915, + "step": 6248500 + }, + { + "epoch": 1.9425926415249772, + "grad_norm": 13.332839012145996, + "learning_rate": 1.7623455974583714e-05, + "loss": 2.6541, + "step": 6249000 + }, + { + "epoch": 1.942748073805464, + "grad_norm": 9.831883430480957, + "learning_rate": 1.76208654365756e-05, + "loss": 2.6715, + "step": 6249500 + }, + { + "epoch": 1.942903506085951, + "grad_norm": 11.252925872802734, + "learning_rate": 1.7618274898567485e-05, + "loss": 2.6316, + "step": 6250000 + }, + { + "epoch": 1.9430589383664378, + "grad_norm": 9.075813293457031, + "learning_rate": 1.7615684360559372e-05, + "loss": 2.6844, + "step": 6250500 + }, + { + "epoch": 1.9432143706469247, + "grad_norm": 10.388398170471191, + "learning_rate": 1.7613093822551256e-05, + "loss": 2.7156, + "step": 6251000 + }, + { + "epoch": 1.9433698029274116, + "grad_norm": 82.93697357177734, + "learning_rate": 1.7610503284543143e-05, + "loss": 2.7288, + "step": 6251500 + }, + { + "epoch": 1.9435252352078984, + "grad_norm": 8.914904594421387, + "learning_rate": 1.7607912746535027e-05, + "loss": 2.733, + "step": 6252000 + }, + { + "epoch": 1.9436806674883853, + "grad_norm": 7.78976583480835, + "learning_rate": 1.760532220852691e-05, + "loss": 2.7084, + "step": 6252500 + }, + { + "epoch": 1.9438360997688722, + "grad_norm": 26.231340408325195, + "learning_rate": 1.7602731670518797e-05, + "loss": 2.7393, + "step": 6253000 + }, + { + "epoch": 1.943991532049359, + "grad_norm": 12.74790096282959, + "learning_rate": 1.7600141132510685e-05, + "loss": 2.6896, + "step": 6253500 + }, + { + "epoch": 1.944146964329846, + "grad_norm": 9.668807983398438, + "learning_rate": 1.759755059450257e-05, + "loss": 2.7142, + "step": 6254000 + }, + { + "epoch": 1.9443023966103328, + "grad_norm": 10.318914413452148, + "learning_rate": 1.7594960056494452e-05, + "loss": 2.7224, + "step": 6254500 + }, + { + "epoch": 1.9444578288908199, + "grad_norm": 10.10628890991211, + "learning_rate": 1.759236951848634e-05, + "loss": 2.6847, + "step": 6255000 + }, + { + "epoch": 1.9446132611713067, + "grad_norm": 7.420587539672852, + "learning_rate": 1.7589778980478226e-05, + "loss": 2.6897, + "step": 6255500 + }, + { + "epoch": 1.9447686934517936, + "grad_norm": 8.224776268005371, + "learning_rate": 1.758718844247011e-05, + "loss": 2.7164, + "step": 6256000 + }, + { + "epoch": 1.9449241257322805, + "grad_norm": 24.24579429626465, + "learning_rate": 1.7584597904461994e-05, + "loss": 2.6762, + "step": 6256500 + }, + { + "epoch": 1.9450795580127673, + "grad_norm": 9.14384651184082, + "learning_rate": 1.758200736645388e-05, + "loss": 2.7018, + "step": 6257000 + }, + { + "epoch": 1.9452349902932542, + "grad_norm": 11.390775680541992, + "learning_rate": 1.7579416828445765e-05, + "loss": 2.7404, + "step": 6257500 + }, + { + "epoch": 1.945390422573741, + "grad_norm": 9.320452690124512, + "learning_rate": 1.7576826290437652e-05, + "loss": 2.6831, + "step": 6258000 + }, + { + "epoch": 1.945545854854228, + "grad_norm": 8.517974853515625, + "learning_rate": 1.757423575242954e-05, + "loss": 2.7316, + "step": 6258500 + }, + { + "epoch": 1.9457012871347148, + "grad_norm": 8.980229377746582, + "learning_rate": 1.7571645214421423e-05, + "loss": 2.7072, + "step": 6259000 + }, + { + "epoch": 1.9458567194152017, + "grad_norm": 11.717243194580078, + "learning_rate": 1.7569054676413307e-05, + "loss": 2.6424, + "step": 6259500 + }, + { + "epoch": 1.9460121516956885, + "grad_norm": 12.068193435668945, + "learning_rate": 1.756646413840519e-05, + "loss": 2.6742, + "step": 6260000 + }, + { + "epoch": 1.9461675839761754, + "grad_norm": 12.43643856048584, + "learning_rate": 1.756387360039708e-05, + "loss": 2.7493, + "step": 6260500 + }, + { + "epoch": 1.9463230162566623, + "grad_norm": 10.549778938293457, + "learning_rate": 1.7561283062388965e-05, + "loss": 2.7207, + "step": 6261000 + }, + { + "epoch": 1.9464784485371491, + "grad_norm": 13.023324966430664, + "learning_rate": 1.755869252438085e-05, + "loss": 2.7474, + "step": 6261500 + }, + { + "epoch": 1.946633880817636, + "grad_norm": 18.445024490356445, + "learning_rate": 1.7556101986372736e-05, + "loss": 2.7147, + "step": 6262000 + }, + { + "epoch": 1.9467893130981229, + "grad_norm": 7.9459381103515625, + "learning_rate": 1.755351144836462e-05, + "loss": 2.6875, + "step": 6262500 + }, + { + "epoch": 1.9469447453786097, + "grad_norm": 9.973637580871582, + "learning_rate": 1.7550920910356507e-05, + "loss": 2.7267, + "step": 6263000 + }, + { + "epoch": 1.9471001776590966, + "grad_norm": 41.4423828125, + "learning_rate": 1.754833037234839e-05, + "loss": 2.7092, + "step": 6263500 + }, + { + "epoch": 1.9472556099395835, + "grad_norm": 9.353131294250488, + "learning_rate": 1.7545739834340278e-05, + "loss": 2.7547, + "step": 6264000 + }, + { + "epoch": 1.9474110422200703, + "grad_norm": 10.361119270324707, + "learning_rate": 1.754314929633216e-05, + "loss": 2.6999, + "step": 6264500 + }, + { + "epoch": 1.9475664745005572, + "grad_norm": 11.019140243530273, + "learning_rate": 1.7540558758324045e-05, + "loss": 2.6797, + "step": 6265000 + }, + { + "epoch": 1.947721906781044, + "grad_norm": 9.981464385986328, + "learning_rate": 1.7537968220315932e-05, + "loss": 2.7147, + "step": 6265500 + }, + { + "epoch": 1.947877339061531, + "grad_norm": 10.296342849731445, + "learning_rate": 1.753537768230782e-05, + "loss": 2.7023, + "step": 6266000 + }, + { + "epoch": 1.9480327713420178, + "grad_norm": 8.523398399353027, + "learning_rate": 1.7532787144299703e-05, + "loss": 2.7594, + "step": 6266500 + }, + { + "epoch": 1.9481882036225047, + "grad_norm": 9.39046859741211, + "learning_rate": 1.7530196606291587e-05, + "loss": 2.6875, + "step": 6267000 + }, + { + "epoch": 1.9483436359029915, + "grad_norm": 9.009559631347656, + "learning_rate": 1.7527606068283474e-05, + "loss": 2.6971, + "step": 6267500 + }, + { + "epoch": 1.9484990681834784, + "grad_norm": 11.321907043457031, + "learning_rate": 1.752501553027536e-05, + "loss": 2.7343, + "step": 6268000 + }, + { + "epoch": 1.9486545004639653, + "grad_norm": 7.537430286407471, + "learning_rate": 1.7522424992267245e-05, + "loss": 2.6931, + "step": 6268500 + }, + { + "epoch": 1.9488099327444521, + "grad_norm": 8.807741165161133, + "learning_rate": 1.751983445425913e-05, + "loss": 2.6855, + "step": 6269000 + }, + { + "epoch": 1.948965365024939, + "grad_norm": 19.931901931762695, + "learning_rate": 1.7517243916251016e-05, + "loss": 2.7359, + "step": 6269500 + }, + { + "epoch": 1.9491207973054259, + "grad_norm": 11.290868759155273, + "learning_rate": 1.75146533782429e-05, + "loss": 2.7069, + "step": 6270000 + }, + { + "epoch": 1.9492762295859127, + "grad_norm": 10.288359642028809, + "learning_rate": 1.7512062840234787e-05, + "loss": 2.7209, + "step": 6270500 + }, + { + "epoch": 1.9494316618663996, + "grad_norm": 11.959267616271973, + "learning_rate": 1.7509472302226674e-05, + "loss": 2.6858, + "step": 6271000 + }, + { + "epoch": 1.9495870941468865, + "grad_norm": 10.99095344543457, + "learning_rate": 1.7506881764218558e-05, + "loss": 2.7221, + "step": 6271500 + }, + { + "epoch": 1.9497425264273733, + "grad_norm": 9.430435180664062, + "learning_rate": 1.750429122621044e-05, + "loss": 2.7247, + "step": 6272000 + }, + { + "epoch": 1.9498979587078602, + "grad_norm": 9.99356460571289, + "learning_rate": 1.750170068820233e-05, + "loss": 2.6562, + "step": 6272500 + }, + { + "epoch": 1.9500533909883473, + "grad_norm": 8.945643424987793, + "learning_rate": 1.7499110150194216e-05, + "loss": 2.7236, + "step": 6273000 + }, + { + "epoch": 1.9502088232688342, + "grad_norm": 9.717695236206055, + "learning_rate": 1.74965196121861e-05, + "loss": 2.7135, + "step": 6273500 + }, + { + "epoch": 1.950364255549321, + "grad_norm": 10.170516967773438, + "learning_rate": 1.7493929074177983e-05, + "loss": 2.7264, + "step": 6274000 + }, + { + "epoch": 1.950519687829808, + "grad_norm": 8.00455093383789, + "learning_rate": 1.7491338536169867e-05, + "loss": 2.718, + "step": 6274500 + }, + { + "epoch": 1.9506751201102948, + "grad_norm": 17.105566024780273, + "learning_rate": 1.7488747998161754e-05, + "loss": 2.6671, + "step": 6275000 + }, + { + "epoch": 1.9508305523907816, + "grad_norm": 7.929605960845947, + "learning_rate": 1.748615746015364e-05, + "loss": 2.671, + "step": 6275500 + }, + { + "epoch": 1.9509859846712685, + "grad_norm": 8.384932518005371, + "learning_rate": 1.7483566922145525e-05, + "loss": 2.7672, + "step": 6276000 + }, + { + "epoch": 1.9511414169517554, + "grad_norm": 9.041644096374512, + "learning_rate": 1.7480976384137412e-05, + "loss": 2.7368, + "step": 6276500 + }, + { + "epoch": 1.9512968492322422, + "grad_norm": 8.400248527526855, + "learning_rate": 1.7478385846129296e-05, + "loss": 2.6928, + "step": 6277000 + }, + { + "epoch": 1.951452281512729, + "grad_norm": 8.966097831726074, + "learning_rate": 1.7475795308121183e-05, + "loss": 2.7161, + "step": 6277500 + }, + { + "epoch": 1.951607713793216, + "grad_norm": 16.07844352722168, + "learning_rate": 1.7473204770113067e-05, + "loss": 2.6723, + "step": 6278000 + }, + { + "epoch": 1.9517631460737028, + "grad_norm": 7.373655796051025, + "learning_rate": 1.7470614232104954e-05, + "loss": 2.7244, + "step": 6278500 + }, + { + "epoch": 1.95191857835419, + "grad_norm": 9.455856323242188, + "learning_rate": 1.7468023694096838e-05, + "loss": 2.6663, + "step": 6279000 + }, + { + "epoch": 1.9520740106346768, + "grad_norm": 9.764810562133789, + "learning_rate": 1.7465433156088722e-05, + "loss": 2.7234, + "step": 6279500 + }, + { + "epoch": 1.9522294429151636, + "grad_norm": 8.73656940460205, + "learning_rate": 1.746284261808061e-05, + "loss": 2.7231, + "step": 6280000 + }, + { + "epoch": 1.9523848751956505, + "grad_norm": 13.602486610412598, + "learning_rate": 1.7460252080072496e-05, + "loss": 2.7112, + "step": 6280500 + }, + { + "epoch": 1.9525403074761374, + "grad_norm": 8.266012191772461, + "learning_rate": 1.745766154206438e-05, + "loss": 2.69, + "step": 6281000 + }, + { + "epoch": 1.9526957397566242, + "grad_norm": 8.910715103149414, + "learning_rate": 1.7455071004056264e-05, + "loss": 2.6793, + "step": 6281500 + }, + { + "epoch": 1.9528511720371111, + "grad_norm": 15.084343910217285, + "learning_rate": 1.745248046604815e-05, + "loss": 2.6877, + "step": 6282000 + }, + { + "epoch": 1.953006604317598, + "grad_norm": 10.115996360778809, + "learning_rate": 1.7449889928040038e-05, + "loss": 2.6687, + "step": 6282500 + }, + { + "epoch": 1.9531620365980848, + "grad_norm": 22.743738174438477, + "learning_rate": 1.744729939003192e-05, + "loss": 2.6682, + "step": 6283000 + }, + { + "epoch": 1.9533174688785717, + "grad_norm": 19.963212966918945, + "learning_rate": 1.7444708852023805e-05, + "loss": 2.7604, + "step": 6283500 + }, + { + "epoch": 1.9534729011590586, + "grad_norm": 16.209733963012695, + "learning_rate": 1.7442118314015693e-05, + "loss": 2.6934, + "step": 6284000 + }, + { + "epoch": 1.9536283334395455, + "grad_norm": 25.527549743652344, + "learning_rate": 1.7439527776007576e-05, + "loss": 2.6678, + "step": 6284500 + }, + { + "epoch": 1.9537837657200323, + "grad_norm": 10.088175773620605, + "learning_rate": 1.7436937237999463e-05, + "loss": 2.7171, + "step": 6285000 + }, + { + "epoch": 1.9539391980005192, + "grad_norm": 8.003850936889648, + "learning_rate": 1.743434669999135e-05, + "loss": 2.7071, + "step": 6285500 + }, + { + "epoch": 1.954094630281006, + "grad_norm": 8.614705085754395, + "learning_rate": 1.7431756161983234e-05, + "loss": 2.6833, + "step": 6286000 + }, + { + "epoch": 1.954250062561493, + "grad_norm": 12.36047649383545, + "learning_rate": 1.7429165623975118e-05, + "loss": 2.6893, + "step": 6286500 + }, + { + "epoch": 1.9544054948419798, + "grad_norm": 10.21396255493164, + "learning_rate": 1.7426575085967002e-05, + "loss": 2.7177, + "step": 6287000 + }, + { + "epoch": 1.9545609271224667, + "grad_norm": 34.71122360229492, + "learning_rate": 1.7423984547958892e-05, + "loss": 2.7029, + "step": 6287500 + }, + { + "epoch": 1.9547163594029535, + "grad_norm": 10.778740882873535, + "learning_rate": 1.7421394009950776e-05, + "loss": 2.7049, + "step": 6288000 + }, + { + "epoch": 1.9548717916834404, + "grad_norm": 8.501799583435059, + "learning_rate": 1.741880347194266e-05, + "loss": 2.7086, + "step": 6288500 + }, + { + "epoch": 1.9550272239639273, + "grad_norm": 9.08053970336914, + "learning_rate": 1.7416212933934547e-05, + "loss": 2.666, + "step": 6289000 + }, + { + "epoch": 1.9551826562444141, + "grad_norm": 9.176589012145996, + "learning_rate": 1.741362239592643e-05, + "loss": 2.6879, + "step": 6289500 + }, + { + "epoch": 1.955338088524901, + "grad_norm": 12.177523612976074, + "learning_rate": 1.7411031857918318e-05, + "loss": 2.6594, + "step": 6290000 + }, + { + "epoch": 1.9554935208053879, + "grad_norm": 10.025749206542969, + "learning_rate": 1.7408441319910202e-05, + "loss": 2.6935, + "step": 6290500 + }, + { + "epoch": 1.9556489530858747, + "grad_norm": 11.071272850036621, + "learning_rate": 1.740585078190209e-05, + "loss": 2.6891, + "step": 6291000 + }, + { + "epoch": 1.9558043853663616, + "grad_norm": 13.210013389587402, + "learning_rate": 1.7403260243893973e-05, + "loss": 2.7142, + "step": 6291500 + }, + { + "epoch": 1.9559598176468485, + "grad_norm": 11.849190711975098, + "learning_rate": 1.7400669705885857e-05, + "loss": 2.7192, + "step": 6292000 + }, + { + "epoch": 1.9561152499273353, + "grad_norm": 8.894112586975098, + "learning_rate": 1.7398079167877744e-05, + "loss": 2.7217, + "step": 6292500 + }, + { + "epoch": 1.9562706822078222, + "grad_norm": 20.152812957763672, + "learning_rate": 1.739548862986963e-05, + "loss": 2.6933, + "step": 6293000 + }, + { + "epoch": 1.956426114488309, + "grad_norm": 8.871316909790039, + "learning_rate": 1.7392898091861515e-05, + "loss": 2.7087, + "step": 6293500 + }, + { + "epoch": 1.956581546768796, + "grad_norm": 8.726814270019531, + "learning_rate": 1.73903075538534e-05, + "loss": 2.6546, + "step": 6294000 + }, + { + "epoch": 1.9567369790492828, + "grad_norm": 17.03890037536621, + "learning_rate": 1.7387717015845285e-05, + "loss": 2.7051, + "step": 6294500 + }, + { + "epoch": 1.9568924113297697, + "grad_norm": 8.354791641235352, + "learning_rate": 1.7385126477837173e-05, + "loss": 2.722, + "step": 6295000 + }, + { + "epoch": 1.9570478436102565, + "grad_norm": 26.148632049560547, + "learning_rate": 1.7382535939829056e-05, + "loss": 2.7012, + "step": 6295500 + }, + { + "epoch": 1.9572032758907434, + "grad_norm": 11.385266304016113, + "learning_rate": 1.737994540182094e-05, + "loss": 2.7339, + "step": 6296000 + }, + { + "epoch": 1.9573587081712303, + "grad_norm": 10.034164428710938, + "learning_rate": 1.7377354863812827e-05, + "loss": 2.7252, + "step": 6296500 + }, + { + "epoch": 1.9575141404517173, + "grad_norm": 8.497214317321777, + "learning_rate": 1.737476432580471e-05, + "loss": 2.6923, + "step": 6297000 + }, + { + "epoch": 1.9576695727322042, + "grad_norm": 10.960000991821289, + "learning_rate": 1.7372173787796598e-05, + "loss": 2.7326, + "step": 6297500 + }, + { + "epoch": 1.957825005012691, + "grad_norm": 13.866413116455078, + "learning_rate": 1.7369583249788485e-05, + "loss": 2.7203, + "step": 6298000 + }, + { + "epoch": 1.957980437293178, + "grad_norm": 13.657150268554688, + "learning_rate": 1.736699271178037e-05, + "loss": 2.6767, + "step": 6298500 + }, + { + "epoch": 1.9581358695736648, + "grad_norm": 8.370461463928223, + "learning_rate": 1.7364402173772253e-05, + "loss": 2.7098, + "step": 6299000 + }, + { + "epoch": 1.9582913018541517, + "grad_norm": 10.06187629699707, + "learning_rate": 1.736181163576414e-05, + "loss": 2.7431, + "step": 6299500 + }, + { + "epoch": 1.9584467341346385, + "grad_norm": 28.316543579101562, + "learning_rate": 1.7359221097756027e-05, + "loss": 2.6961, + "step": 6300000 + }, + { + "epoch": 1.9586021664151254, + "grad_norm": 9.594482421875, + "learning_rate": 1.735663055974791e-05, + "loss": 2.6992, + "step": 6300500 + }, + { + "epoch": 1.9587575986956123, + "grad_norm": 10.198009490966797, + "learning_rate": 1.7354040021739795e-05, + "loss": 2.69, + "step": 6301000 + }, + { + "epoch": 1.9589130309760991, + "grad_norm": 11.132537841796875, + "learning_rate": 1.735144948373168e-05, + "loss": 2.7467, + "step": 6301500 + }, + { + "epoch": 1.959068463256586, + "grad_norm": 9.071274757385254, + "learning_rate": 1.7348858945723566e-05, + "loss": 2.6801, + "step": 6302000 + }, + { + "epoch": 1.9592238955370729, + "grad_norm": 9.506178855895996, + "learning_rate": 1.7346268407715453e-05, + "loss": 2.6928, + "step": 6302500 + }, + { + "epoch": 1.95937932781756, + "grad_norm": 10.286179542541504, + "learning_rate": 1.7343677869707337e-05, + "loss": 2.6778, + "step": 6303000 + }, + { + "epoch": 1.9595347600980468, + "grad_norm": 8.994146347045898, + "learning_rate": 1.7341087331699224e-05, + "loss": 2.6926, + "step": 6303500 + }, + { + "epoch": 1.9596901923785337, + "grad_norm": 9.411577224731445, + "learning_rate": 1.7338496793691108e-05, + "loss": 2.7168, + "step": 6304000 + }, + { + "epoch": 1.9598456246590206, + "grad_norm": 10.510900497436523, + "learning_rate": 1.7335906255682995e-05, + "loss": 2.6624, + "step": 6304500 + }, + { + "epoch": 1.9600010569395074, + "grad_norm": 10.519540786743164, + "learning_rate": 1.733331571767488e-05, + "loss": 2.7458, + "step": 6305000 + }, + { + "epoch": 1.9601564892199943, + "grad_norm": 8.269734382629395, + "learning_rate": 1.7330725179666766e-05, + "loss": 2.682, + "step": 6305500 + }, + { + "epoch": 1.9603119215004812, + "grad_norm": 9.142173767089844, + "learning_rate": 1.732813464165865e-05, + "loss": 2.7127, + "step": 6306000 + }, + { + "epoch": 1.960467353780968, + "grad_norm": 10.683876991271973, + "learning_rate": 1.7325544103650533e-05, + "loss": 2.7579, + "step": 6306500 + }, + { + "epoch": 1.960622786061455, + "grad_norm": 6.316325664520264, + "learning_rate": 1.732295356564242e-05, + "loss": 2.7481, + "step": 6307000 + }, + { + "epoch": 1.9607782183419418, + "grad_norm": 8.415277481079102, + "learning_rate": 1.7320363027634307e-05, + "loss": 2.6847, + "step": 6307500 + }, + { + "epoch": 1.9609336506224286, + "grad_norm": 10.819091796875, + "learning_rate": 1.731777248962619e-05, + "loss": 2.6789, + "step": 6308000 + }, + { + "epoch": 1.9610890829029155, + "grad_norm": 10.991046905517578, + "learning_rate": 1.7315181951618075e-05, + "loss": 2.718, + "step": 6308500 + }, + { + "epoch": 1.9612445151834024, + "grad_norm": 9.224801063537598, + "learning_rate": 1.7312591413609962e-05, + "loss": 2.7647, + "step": 6309000 + }, + { + "epoch": 1.9613999474638892, + "grad_norm": 12.009142875671387, + "learning_rate": 1.731000087560185e-05, + "loss": 2.7038, + "step": 6309500 + }, + { + "epoch": 1.961555379744376, + "grad_norm": 15.023393630981445, + "learning_rate": 1.7307410337593733e-05, + "loss": 2.7158, + "step": 6310000 + }, + { + "epoch": 1.961710812024863, + "grad_norm": 8.492396354675293, + "learning_rate": 1.7304819799585617e-05, + "loss": 2.6479, + "step": 6310500 + }, + { + "epoch": 1.9618662443053498, + "grad_norm": 7.869043827056885, + "learning_rate": 1.7302229261577504e-05, + "loss": 2.7355, + "step": 6311000 + }, + { + "epoch": 1.9620216765858367, + "grad_norm": 10.545600891113281, + "learning_rate": 1.7299638723569388e-05, + "loss": 2.6859, + "step": 6311500 + }, + { + "epoch": 1.9621771088663236, + "grad_norm": 15.430359840393066, + "learning_rate": 1.7297048185561275e-05, + "loss": 2.6674, + "step": 6312000 + }, + { + "epoch": 1.9623325411468104, + "grad_norm": 8.679728507995605, + "learning_rate": 1.7294457647553162e-05, + "loss": 2.7195, + "step": 6312500 + }, + { + "epoch": 1.9624879734272973, + "grad_norm": 11.885601043701172, + "learning_rate": 1.7291867109545046e-05, + "loss": 2.7479, + "step": 6313000 + }, + { + "epoch": 1.9626434057077842, + "grad_norm": 8.6301851272583, + "learning_rate": 1.728927657153693e-05, + "loss": 2.704, + "step": 6313500 + }, + { + "epoch": 1.962798837988271, + "grad_norm": 9.897290229797363, + "learning_rate": 1.7286686033528813e-05, + "loss": 2.688, + "step": 6314000 + }, + { + "epoch": 1.962954270268758, + "grad_norm": 9.223912239074707, + "learning_rate": 1.7284095495520704e-05, + "loss": 2.7615, + "step": 6314500 + }, + { + "epoch": 1.9631097025492448, + "grad_norm": 9.893311500549316, + "learning_rate": 1.7281504957512588e-05, + "loss": 2.6814, + "step": 6315000 + }, + { + "epoch": 1.9632651348297316, + "grad_norm": 9.93043041229248, + "learning_rate": 1.727891441950447e-05, + "loss": 2.737, + "step": 6315500 + }, + { + "epoch": 1.9634205671102185, + "grad_norm": 11.269104957580566, + "learning_rate": 1.727632388149636e-05, + "loss": 2.6791, + "step": 6316000 + }, + { + "epoch": 1.9635759993907054, + "grad_norm": 17.714115142822266, + "learning_rate": 1.7273733343488242e-05, + "loss": 2.7505, + "step": 6316500 + }, + { + "epoch": 1.9637314316711922, + "grad_norm": 10.416509628295898, + "learning_rate": 1.727114280548013e-05, + "loss": 2.6771, + "step": 6317000 + }, + { + "epoch": 1.963886863951679, + "grad_norm": 9.182862281799316, + "learning_rate": 1.7268552267472013e-05, + "loss": 2.6953, + "step": 6317500 + }, + { + "epoch": 1.964042296232166, + "grad_norm": 18.95886993408203, + "learning_rate": 1.72659617294639e-05, + "loss": 2.6714, + "step": 6318000 + }, + { + "epoch": 1.9641977285126528, + "grad_norm": 24.038135528564453, + "learning_rate": 1.7263371191455784e-05, + "loss": 2.7417, + "step": 6318500 + }, + { + "epoch": 1.9643531607931397, + "grad_norm": 8.40233325958252, + "learning_rate": 1.7260780653447668e-05, + "loss": 2.7153, + "step": 6319000 + }, + { + "epoch": 1.9645085930736266, + "grad_norm": 11.604591369628906, + "learning_rate": 1.7258190115439555e-05, + "loss": 2.7288, + "step": 6319500 + }, + { + "epoch": 1.9646640253541134, + "grad_norm": 9.450324058532715, + "learning_rate": 1.7255599577431442e-05, + "loss": 2.7283, + "step": 6320000 + }, + { + "epoch": 1.9648194576346003, + "grad_norm": 10.304622650146484, + "learning_rate": 1.7253009039423326e-05, + "loss": 2.7062, + "step": 6320500 + }, + { + "epoch": 1.9649748899150874, + "grad_norm": 9.0465669631958, + "learning_rate": 1.725041850141521e-05, + "loss": 2.6892, + "step": 6321000 + }, + { + "epoch": 1.9651303221955743, + "grad_norm": 20.67472267150879, + "learning_rate": 1.7247827963407097e-05, + "loss": 2.711, + "step": 6321500 + }, + { + "epoch": 1.9652857544760611, + "grad_norm": 9.374152183532715, + "learning_rate": 1.7245237425398984e-05, + "loss": 2.7129, + "step": 6322000 + }, + { + "epoch": 1.965441186756548, + "grad_norm": 8.711250305175781, + "learning_rate": 1.7242646887390868e-05, + "loss": 2.7142, + "step": 6322500 + }, + { + "epoch": 1.9655966190370349, + "grad_norm": 13.491230964660645, + "learning_rate": 1.724005634938275e-05, + "loss": 2.7279, + "step": 6323000 + }, + { + "epoch": 1.9657520513175217, + "grad_norm": 10.245499610900879, + "learning_rate": 1.723746581137464e-05, + "loss": 2.6688, + "step": 6323500 + }, + { + "epoch": 1.9659074835980086, + "grad_norm": 8.620443344116211, + "learning_rate": 1.7234875273366523e-05, + "loss": 2.7022, + "step": 6324000 + }, + { + "epoch": 1.9660629158784955, + "grad_norm": 7.800626754760742, + "learning_rate": 1.723228473535841e-05, + "loss": 2.754, + "step": 6324500 + }, + { + "epoch": 1.9662183481589823, + "grad_norm": 8.278101921081543, + "learning_rate": 1.7229694197350297e-05, + "loss": 2.7323, + "step": 6325000 + }, + { + "epoch": 1.9663737804394692, + "grad_norm": 11.597506523132324, + "learning_rate": 1.722710365934218e-05, + "loss": 2.709, + "step": 6325500 + }, + { + "epoch": 1.966529212719956, + "grad_norm": 9.825881004333496, + "learning_rate": 1.7224513121334064e-05, + "loss": 2.7516, + "step": 6326000 + }, + { + "epoch": 1.966684645000443, + "grad_norm": 9.46397876739502, + "learning_rate": 1.722192258332595e-05, + "loss": 2.7293, + "step": 6326500 + }, + { + "epoch": 1.96684007728093, + "grad_norm": 15.32083511352539, + "learning_rate": 1.721933204531784e-05, + "loss": 2.7174, + "step": 6327000 + }, + { + "epoch": 1.9669955095614169, + "grad_norm": 8.730300903320312, + "learning_rate": 1.7216741507309722e-05, + "loss": 2.6914, + "step": 6327500 + }, + { + "epoch": 1.9671509418419038, + "grad_norm": 10.320018768310547, + "learning_rate": 1.7214150969301606e-05, + "loss": 2.7669, + "step": 6328000 + }, + { + "epoch": 1.9673063741223906, + "grad_norm": 10.520570755004883, + "learning_rate": 1.721156043129349e-05, + "loss": 2.6685, + "step": 6328500 + }, + { + "epoch": 1.9674618064028775, + "grad_norm": 7.847713947296143, + "learning_rate": 1.7208969893285377e-05, + "loss": 2.7003, + "step": 6329000 + }, + { + "epoch": 1.9676172386833644, + "grad_norm": 8.271740913391113, + "learning_rate": 1.7206379355277264e-05, + "loss": 2.7388, + "step": 6329500 + }, + { + "epoch": 1.9677726709638512, + "grad_norm": 11.199289321899414, + "learning_rate": 1.7203788817269148e-05, + "loss": 2.677, + "step": 6330000 + }, + { + "epoch": 1.967928103244338, + "grad_norm": 14.496686935424805, + "learning_rate": 1.7201198279261035e-05, + "loss": 2.6975, + "step": 6330500 + }, + { + "epoch": 1.968083535524825, + "grad_norm": 9.489480018615723, + "learning_rate": 1.719860774125292e-05, + "loss": 2.702, + "step": 6331000 + }, + { + "epoch": 1.9682389678053118, + "grad_norm": 11.57352352142334, + "learning_rate": 1.7196017203244806e-05, + "loss": 2.701, + "step": 6331500 + }, + { + "epoch": 1.9683944000857987, + "grad_norm": 10.949191093444824, + "learning_rate": 1.719342666523669e-05, + "loss": 2.7212, + "step": 6332000 + }, + { + "epoch": 1.9685498323662856, + "grad_norm": 10.271446228027344, + "learning_rate": 1.7190836127228577e-05, + "loss": 2.7141, + "step": 6332500 + }, + { + "epoch": 1.9687052646467724, + "grad_norm": 9.697540283203125, + "learning_rate": 1.718824558922046e-05, + "loss": 2.7165, + "step": 6333000 + }, + { + "epoch": 1.9688606969272593, + "grad_norm": 9.156517028808594, + "learning_rate": 1.7185655051212345e-05, + "loss": 2.7265, + "step": 6333500 + }, + { + "epoch": 1.9690161292077462, + "grad_norm": 11.146441459655762, + "learning_rate": 1.7183064513204232e-05, + "loss": 2.7014, + "step": 6334000 + }, + { + "epoch": 1.969171561488233, + "grad_norm": 8.774227142333984, + "learning_rate": 1.718047397519612e-05, + "loss": 2.7187, + "step": 6334500 + }, + { + "epoch": 1.96932699376872, + "grad_norm": 9.609609603881836, + "learning_rate": 1.7177883437188003e-05, + "loss": 2.7359, + "step": 6335000 + }, + { + "epoch": 1.9694824260492068, + "grad_norm": 15.197785377502441, + "learning_rate": 1.7175292899179886e-05, + "loss": 2.6547, + "step": 6335500 + }, + { + "epoch": 1.9696378583296936, + "grad_norm": 62.099632263183594, + "learning_rate": 1.7172702361171774e-05, + "loss": 2.6911, + "step": 6336000 + }, + { + "epoch": 1.9697932906101805, + "grad_norm": 8.91970157623291, + "learning_rate": 1.717011182316366e-05, + "loss": 2.6914, + "step": 6336500 + }, + { + "epoch": 1.9699487228906674, + "grad_norm": 25.63961410522461, + "learning_rate": 1.7167521285155544e-05, + "loss": 2.7408, + "step": 6337000 + }, + { + "epoch": 1.9701041551711542, + "grad_norm": 8.447606086730957, + "learning_rate": 1.7164930747147428e-05, + "loss": 2.6946, + "step": 6337500 + }, + { + "epoch": 1.970259587451641, + "grad_norm": 9.8547945022583, + "learning_rate": 1.7162340209139315e-05, + "loss": 2.6974, + "step": 6338000 + }, + { + "epoch": 1.970415019732128, + "grad_norm": 7.425563335418701, + "learning_rate": 1.71597496711312e-05, + "loss": 2.7358, + "step": 6338500 + }, + { + "epoch": 1.9705704520126148, + "grad_norm": 10.296028137207031, + "learning_rate": 1.7157159133123086e-05, + "loss": 2.7506, + "step": 6339000 + }, + { + "epoch": 1.9707258842931017, + "grad_norm": 10.913650512695312, + "learning_rate": 1.7154568595114973e-05, + "loss": 2.6769, + "step": 6339500 + }, + { + "epoch": 1.9708813165735886, + "grad_norm": 10.736574172973633, + "learning_rate": 1.7151978057106857e-05, + "loss": 2.7145, + "step": 6340000 + }, + { + "epoch": 1.9710367488540754, + "grad_norm": 9.326741218566895, + "learning_rate": 1.714938751909874e-05, + "loss": 2.7559, + "step": 6340500 + }, + { + "epoch": 1.9711921811345623, + "grad_norm": 10.529989242553711, + "learning_rate": 1.7146796981090628e-05, + "loss": 2.7158, + "step": 6341000 + }, + { + "epoch": 1.9713476134150492, + "grad_norm": 10.31421184539795, + "learning_rate": 1.7144206443082515e-05, + "loss": 2.6957, + "step": 6341500 + }, + { + "epoch": 1.971503045695536, + "grad_norm": 9.091107368469238, + "learning_rate": 1.71416159050744e-05, + "loss": 2.662, + "step": 6342000 + }, + { + "epoch": 1.971658477976023, + "grad_norm": 9.081587791442871, + "learning_rate": 1.7139025367066283e-05, + "loss": 2.6913, + "step": 6342500 + }, + { + "epoch": 1.9718139102565098, + "grad_norm": 12.450324058532715, + "learning_rate": 1.713643482905817e-05, + "loss": 2.7034, + "step": 6343000 + }, + { + "epoch": 1.9719693425369966, + "grad_norm": 9.020288467407227, + "learning_rate": 1.7133844291050054e-05, + "loss": 2.7423, + "step": 6343500 + }, + { + "epoch": 1.9721247748174835, + "grad_norm": 11.366331100463867, + "learning_rate": 1.713125375304194e-05, + "loss": 2.6915, + "step": 6344000 + }, + { + "epoch": 1.9722802070979704, + "grad_norm": 10.186678886413574, + "learning_rate": 1.7128663215033825e-05, + "loss": 2.6908, + "step": 6344500 + }, + { + "epoch": 1.9724356393784575, + "grad_norm": 10.815340995788574, + "learning_rate": 1.7126072677025712e-05, + "loss": 2.6971, + "step": 6345000 + }, + { + "epoch": 1.9725910716589443, + "grad_norm": 6.052585601806641, + "learning_rate": 1.7123482139017596e-05, + "loss": 2.7148, + "step": 6345500 + }, + { + "epoch": 1.9727465039394312, + "grad_norm": 7.9546799659729, + "learning_rate": 1.7120891601009483e-05, + "loss": 2.7419, + "step": 6346000 + }, + { + "epoch": 1.972901936219918, + "grad_norm": 17.898866653442383, + "learning_rate": 1.7118301063001366e-05, + "loss": 2.7189, + "step": 6346500 + }, + { + "epoch": 1.973057368500405, + "grad_norm": 10.379976272583008, + "learning_rate": 1.7115710524993254e-05, + "loss": 2.69, + "step": 6347000 + }, + { + "epoch": 1.9732128007808918, + "grad_norm": 9.676741600036621, + "learning_rate": 1.7113119986985137e-05, + "loss": 2.6825, + "step": 6347500 + }, + { + "epoch": 1.9733682330613787, + "grad_norm": 8.315817832946777, + "learning_rate": 1.711052944897702e-05, + "loss": 2.705, + "step": 6348000 + }, + { + "epoch": 1.9735236653418655, + "grad_norm": 9.401230812072754, + "learning_rate": 1.710793891096891e-05, + "loss": 2.6752, + "step": 6348500 + }, + { + "epoch": 1.9736790976223524, + "grad_norm": 8.434130668640137, + "learning_rate": 1.7105348372960795e-05, + "loss": 2.6725, + "step": 6349000 + }, + { + "epoch": 1.9738345299028393, + "grad_norm": 10.858369827270508, + "learning_rate": 1.710275783495268e-05, + "loss": 2.6969, + "step": 6349500 + }, + { + "epoch": 1.9739899621833261, + "grad_norm": 9.689377784729004, + "learning_rate": 1.7100167296944563e-05, + "loss": 2.7275, + "step": 6350000 + }, + { + "epoch": 1.974145394463813, + "grad_norm": 9.377406120300293, + "learning_rate": 1.709757675893645e-05, + "loss": 2.7027, + "step": 6350500 + }, + { + "epoch": 1.9743008267443, + "grad_norm": 7.863862991333008, + "learning_rate": 1.7094986220928337e-05, + "loss": 2.7455, + "step": 6351000 + }, + { + "epoch": 1.974456259024787, + "grad_norm": 9.379195213317871, + "learning_rate": 1.709239568292022e-05, + "loss": 2.7235, + "step": 6351500 + }, + { + "epoch": 1.9746116913052738, + "grad_norm": 11.383564949035645, + "learning_rate": 1.7089805144912108e-05, + "loss": 2.7325, + "step": 6352000 + }, + { + "epoch": 1.9747671235857607, + "grad_norm": 8.457605361938477, + "learning_rate": 1.7087214606903992e-05, + "loss": 2.7138, + "step": 6352500 + }, + { + "epoch": 1.9749225558662475, + "grad_norm": 9.115133285522461, + "learning_rate": 1.7084624068895876e-05, + "loss": 2.664, + "step": 6353000 + }, + { + "epoch": 1.9750779881467344, + "grad_norm": 7.6553144454956055, + "learning_rate": 1.7082033530887763e-05, + "loss": 2.7271, + "step": 6353500 + }, + { + "epoch": 1.9752334204272213, + "grad_norm": 21.137611389160156, + "learning_rate": 1.707944299287965e-05, + "loss": 2.7321, + "step": 6354000 + }, + { + "epoch": 1.9753888527077081, + "grad_norm": 8.768619537353516, + "learning_rate": 1.7076852454871534e-05, + "loss": 2.684, + "step": 6354500 + }, + { + "epoch": 1.975544284988195, + "grad_norm": 7.48311710357666, + "learning_rate": 1.7074261916863418e-05, + "loss": 2.7075, + "step": 6355000 + }, + { + "epoch": 1.9756997172686819, + "grad_norm": 22.284847259521484, + "learning_rate": 1.70716713788553e-05, + "loss": 2.6834, + "step": 6355500 + }, + { + "epoch": 1.9758551495491687, + "grad_norm": 8.71866512298584, + "learning_rate": 1.7069080840847192e-05, + "loss": 2.7412, + "step": 6356000 + }, + { + "epoch": 1.9760105818296556, + "grad_norm": 9.766355514526367, + "learning_rate": 1.7066490302839076e-05, + "loss": 2.7202, + "step": 6356500 + }, + { + "epoch": 1.9761660141101425, + "grad_norm": 74.4072265625, + "learning_rate": 1.706389976483096e-05, + "loss": 2.6839, + "step": 6357000 + }, + { + "epoch": 1.9763214463906293, + "grad_norm": 8.873008728027344, + "learning_rate": 1.7061309226822847e-05, + "loss": 2.6818, + "step": 6357500 + }, + { + "epoch": 1.9764768786711162, + "grad_norm": 9.245898246765137, + "learning_rate": 1.705871868881473e-05, + "loss": 2.7523, + "step": 6358000 + }, + { + "epoch": 1.976632310951603, + "grad_norm": 17.92935562133789, + "learning_rate": 1.7056128150806618e-05, + "loss": 2.7188, + "step": 6358500 + }, + { + "epoch": 1.97678774323209, + "grad_norm": 9.529455184936523, + "learning_rate": 1.70535376127985e-05, + "loss": 2.686, + "step": 6359000 + }, + { + "epoch": 1.9769431755125768, + "grad_norm": 9.584567070007324, + "learning_rate": 1.705094707479039e-05, + "loss": 2.6938, + "step": 6359500 + }, + { + "epoch": 1.9770986077930637, + "grad_norm": 18.481813430786133, + "learning_rate": 1.7048356536782272e-05, + "loss": 2.7312, + "step": 6360000 + }, + { + "epoch": 1.9772540400735505, + "grad_norm": 10.136115074157715, + "learning_rate": 1.7045765998774156e-05, + "loss": 2.7337, + "step": 6360500 + }, + { + "epoch": 1.9774094723540374, + "grad_norm": 17.868457794189453, + "learning_rate": 1.7043175460766046e-05, + "loss": 2.6472, + "step": 6361000 + }, + { + "epoch": 1.9775649046345243, + "grad_norm": 8.210644721984863, + "learning_rate": 1.704058492275793e-05, + "loss": 2.7034, + "step": 6361500 + }, + { + "epoch": 1.9777203369150111, + "grad_norm": 9.898244857788086, + "learning_rate": 1.7037994384749814e-05, + "loss": 2.6487, + "step": 6362000 + }, + { + "epoch": 1.977875769195498, + "grad_norm": 11.252701759338379, + "learning_rate": 1.7035403846741698e-05, + "loss": 2.7182, + "step": 6362500 + }, + { + "epoch": 1.9780312014759849, + "grad_norm": 9.241209983825684, + "learning_rate": 1.7032813308733585e-05, + "loss": 2.6743, + "step": 6363000 + }, + { + "epoch": 1.9781866337564717, + "grad_norm": 19.489927291870117, + "learning_rate": 1.7030222770725472e-05, + "loss": 2.7028, + "step": 6363500 + }, + { + "epoch": 1.9783420660369586, + "grad_norm": 9.18272590637207, + "learning_rate": 1.7027632232717356e-05, + "loss": 2.7218, + "step": 6364000 + }, + { + "epoch": 1.9784974983174455, + "grad_norm": 8.429157257080078, + "learning_rate": 1.702504169470924e-05, + "loss": 2.7142, + "step": 6364500 + }, + { + "epoch": 1.9786529305979323, + "grad_norm": 9.936046600341797, + "learning_rate": 1.7022451156701127e-05, + "loss": 2.7003, + "step": 6365000 + }, + { + "epoch": 1.9788083628784192, + "grad_norm": 10.575267791748047, + "learning_rate": 1.701986061869301e-05, + "loss": 2.6833, + "step": 6365500 + }, + { + "epoch": 1.978963795158906, + "grad_norm": 10.26268196105957, + "learning_rate": 1.7017270080684898e-05, + "loss": 2.6642, + "step": 6366000 + }, + { + "epoch": 1.979119227439393, + "grad_norm": 7.267053604125977, + "learning_rate": 1.7014679542676785e-05, + "loss": 2.7343, + "step": 6366500 + }, + { + "epoch": 1.9792746597198798, + "grad_norm": 8.092169761657715, + "learning_rate": 1.701208900466867e-05, + "loss": 2.6893, + "step": 6367000 + }, + { + "epoch": 1.9794300920003667, + "grad_norm": 7.915574073791504, + "learning_rate": 1.7009498466660552e-05, + "loss": 2.7382, + "step": 6367500 + }, + { + "epoch": 1.9795855242808535, + "grad_norm": 8.975067138671875, + "learning_rate": 1.700690792865244e-05, + "loss": 2.6968, + "step": 6368000 + }, + { + "epoch": 1.9797409565613404, + "grad_norm": 43.18204879760742, + "learning_rate": 1.7004317390644327e-05, + "loss": 2.6522, + "step": 6368500 + }, + { + "epoch": 1.9798963888418275, + "grad_norm": 9.84407901763916, + "learning_rate": 1.700172685263621e-05, + "loss": 2.6811, + "step": 6369000 + }, + { + "epoch": 1.9800518211223144, + "grad_norm": 10.906784057617188, + "learning_rate": 1.6999136314628094e-05, + "loss": 2.7278, + "step": 6369500 + }, + { + "epoch": 1.9802072534028012, + "grad_norm": 10.38092041015625, + "learning_rate": 1.699654577661998e-05, + "loss": 2.7049, + "step": 6370000 + }, + { + "epoch": 1.980362685683288, + "grad_norm": 10.042959213256836, + "learning_rate": 1.6993955238611865e-05, + "loss": 2.6641, + "step": 6370500 + }, + { + "epoch": 1.980518117963775, + "grad_norm": 18.018714904785156, + "learning_rate": 1.6991364700603752e-05, + "loss": 2.6612, + "step": 6371000 + }, + { + "epoch": 1.9806735502442618, + "grad_norm": 9.635720252990723, + "learning_rate": 1.6988774162595636e-05, + "loss": 2.6931, + "step": 6371500 + }, + { + "epoch": 1.9808289825247487, + "grad_norm": 12.625662803649902, + "learning_rate": 1.6986183624587523e-05, + "loss": 2.6947, + "step": 6372000 + }, + { + "epoch": 1.9809844148052356, + "grad_norm": 7.814671039581299, + "learning_rate": 1.6983593086579407e-05, + "loss": 2.7071, + "step": 6372500 + }, + { + "epoch": 1.9811398470857224, + "grad_norm": 10.412108421325684, + "learning_rate": 1.6981002548571294e-05, + "loss": 2.7053, + "step": 6373000 + }, + { + "epoch": 1.9812952793662093, + "grad_norm": 8.759142875671387, + "learning_rate": 1.697841201056318e-05, + "loss": 2.6838, + "step": 6373500 + }, + { + "epoch": 1.9814507116466962, + "grad_norm": 14.503210067749023, + "learning_rate": 1.6975821472555065e-05, + "loss": 2.6669, + "step": 6374000 + }, + { + "epoch": 1.981606143927183, + "grad_norm": 9.215109825134277, + "learning_rate": 1.697323093454695e-05, + "loss": 2.6794, + "step": 6374500 + }, + { + "epoch": 1.98176157620767, + "grad_norm": 9.752142906188965, + "learning_rate": 1.6970640396538833e-05, + "loss": 2.723, + "step": 6375000 + }, + { + "epoch": 1.981917008488157, + "grad_norm": 8.527563095092773, + "learning_rate": 1.696804985853072e-05, + "loss": 2.6933, + "step": 6375500 + }, + { + "epoch": 1.9820724407686439, + "grad_norm": 8.96672534942627, + "learning_rate": 1.6965459320522607e-05, + "loss": 2.6991, + "step": 6376000 + }, + { + "epoch": 1.9822278730491307, + "grad_norm": 13.44153881072998, + "learning_rate": 1.696286878251449e-05, + "loss": 2.7037, + "step": 6376500 + }, + { + "epoch": 1.9823833053296176, + "grad_norm": 9.963133811950684, + "learning_rate": 1.6960278244506374e-05, + "loss": 2.7366, + "step": 6377000 + }, + { + "epoch": 1.9825387376101045, + "grad_norm": 14.392462730407715, + "learning_rate": 1.695768770649826e-05, + "loss": 2.682, + "step": 6377500 + }, + { + "epoch": 1.9826941698905913, + "grad_norm": 7.4910173416137695, + "learning_rate": 1.695509716849015e-05, + "loss": 2.6353, + "step": 6378000 + }, + { + "epoch": 1.9828496021710782, + "grad_norm": 9.960234642028809, + "learning_rate": 1.6952506630482032e-05, + "loss": 2.7394, + "step": 6378500 + }, + { + "epoch": 1.983005034451565, + "grad_norm": 14.026169776916504, + "learning_rate": 1.694991609247392e-05, + "loss": 2.6465, + "step": 6379000 + }, + { + "epoch": 1.983160466732052, + "grad_norm": 11.172429084777832, + "learning_rate": 1.6947325554465803e-05, + "loss": 2.7127, + "step": 6379500 + }, + { + "epoch": 1.9833158990125388, + "grad_norm": 8.101212501525879, + "learning_rate": 1.6944735016457687e-05, + "loss": 2.7122, + "step": 6380000 + }, + { + "epoch": 1.9834713312930257, + "grad_norm": 10.937416076660156, + "learning_rate": 1.6942144478449574e-05, + "loss": 2.6701, + "step": 6380500 + }, + { + "epoch": 1.9836267635735125, + "grad_norm": 14.805218696594238, + "learning_rate": 1.693955394044146e-05, + "loss": 2.6935, + "step": 6381000 + }, + { + "epoch": 1.9837821958539994, + "grad_norm": 9.413047790527344, + "learning_rate": 1.6936963402433345e-05, + "loss": 2.7034, + "step": 6381500 + }, + { + "epoch": 1.9839376281344863, + "grad_norm": 8.603583335876465, + "learning_rate": 1.693437286442523e-05, + "loss": 2.7245, + "step": 6382000 + }, + { + "epoch": 1.9840930604149731, + "grad_norm": 8.05633544921875, + "learning_rate": 1.6931782326417116e-05, + "loss": 2.6683, + "step": 6382500 + }, + { + "epoch": 1.98424849269546, + "grad_norm": 10.6123046875, + "learning_rate": 1.6929191788409003e-05, + "loss": 2.7347, + "step": 6383000 + }, + { + "epoch": 1.9844039249759469, + "grad_norm": 8.655221939086914, + "learning_rate": 1.6926601250400887e-05, + "loss": 2.6733, + "step": 6383500 + }, + { + "epoch": 1.9845593572564337, + "grad_norm": 9.91203784942627, + "learning_rate": 1.692401071239277e-05, + "loss": 2.6619, + "step": 6384000 + }, + { + "epoch": 1.9847147895369206, + "grad_norm": 18.430355072021484, + "learning_rate": 1.6921420174384658e-05, + "loss": 2.7024, + "step": 6384500 + }, + { + "epoch": 1.9848702218174075, + "grad_norm": 8.07109546661377, + "learning_rate": 1.6918829636376542e-05, + "loss": 2.6792, + "step": 6385000 + }, + { + "epoch": 1.9850256540978943, + "grad_norm": 15.351162910461426, + "learning_rate": 1.691623909836843e-05, + "loss": 2.6659, + "step": 6385500 + }, + { + "epoch": 1.9851810863783812, + "grad_norm": 8.193418502807617, + "learning_rate": 1.6913648560360313e-05, + "loss": 2.7128, + "step": 6386000 + }, + { + "epoch": 1.985336518658868, + "grad_norm": 9.880047798156738, + "learning_rate": 1.69110580223522e-05, + "loss": 2.6906, + "step": 6386500 + }, + { + "epoch": 1.985491950939355, + "grad_norm": 22.78029441833496, + "learning_rate": 1.6908467484344084e-05, + "loss": 2.6893, + "step": 6387000 + }, + { + "epoch": 1.9856473832198418, + "grad_norm": 7.398043632507324, + "learning_rate": 1.6905876946335967e-05, + "loss": 2.7182, + "step": 6387500 + }, + { + "epoch": 1.9858028155003287, + "grad_norm": 11.685927391052246, + "learning_rate": 1.6903286408327858e-05, + "loss": 2.7003, + "step": 6388000 + }, + { + "epoch": 1.9859582477808155, + "grad_norm": 7.88861083984375, + "learning_rate": 1.690069587031974e-05, + "loss": 2.7124, + "step": 6388500 + }, + { + "epoch": 1.9861136800613024, + "grad_norm": 9.912285804748535, + "learning_rate": 1.6898105332311625e-05, + "loss": 2.6951, + "step": 6389000 + }, + { + "epoch": 1.9862691123417893, + "grad_norm": 10.641924858093262, + "learning_rate": 1.689551479430351e-05, + "loss": 2.706, + "step": 6389500 + }, + { + "epoch": 1.9864245446222761, + "grad_norm": 10.065526008605957, + "learning_rate": 1.6892924256295396e-05, + "loss": 2.6776, + "step": 6390000 + }, + { + "epoch": 1.986579976902763, + "grad_norm": 15.731060028076172, + "learning_rate": 1.6890333718287284e-05, + "loss": 2.7028, + "step": 6390500 + }, + { + "epoch": 1.9867354091832499, + "grad_norm": 16.459117889404297, + "learning_rate": 1.6887743180279167e-05, + "loss": 2.6879, + "step": 6391000 + }, + { + "epoch": 1.9868908414637367, + "grad_norm": 16.42732048034668, + "learning_rate": 1.6885152642271054e-05, + "loss": 2.716, + "step": 6391500 + }, + { + "epoch": 1.9870462737442236, + "grad_norm": 9.15670394897461, + "learning_rate": 1.6882562104262938e-05, + "loss": 2.7098, + "step": 6392000 + }, + { + "epoch": 1.9872017060247105, + "grad_norm": 10.730207443237305, + "learning_rate": 1.6879971566254822e-05, + "loss": 2.7006, + "step": 6392500 + }, + { + "epoch": 1.9873571383051973, + "grad_norm": 11.513872146606445, + "learning_rate": 1.687738102824671e-05, + "loss": 2.7734, + "step": 6393000 + }, + { + "epoch": 1.9875125705856844, + "grad_norm": 10.118189811706543, + "learning_rate": 1.6874790490238596e-05, + "loss": 2.7, + "step": 6393500 + }, + { + "epoch": 1.9876680028661713, + "grad_norm": 7.965907096862793, + "learning_rate": 1.687219995223048e-05, + "loss": 2.6779, + "step": 6394000 + }, + { + "epoch": 1.9878234351466582, + "grad_norm": 10.677748680114746, + "learning_rate": 1.6869609414222364e-05, + "loss": 2.6988, + "step": 6394500 + }, + { + "epoch": 1.987978867427145, + "grad_norm": 10.163673400878906, + "learning_rate": 1.686701887621425e-05, + "loss": 2.7083, + "step": 6395000 + }, + { + "epoch": 1.988134299707632, + "grad_norm": 8.73366928100586, + "learning_rate": 1.6864428338206138e-05, + "loss": 2.7119, + "step": 6395500 + }, + { + "epoch": 1.9882897319881188, + "grad_norm": 8.567639350891113, + "learning_rate": 1.6861837800198022e-05, + "loss": 2.6678, + "step": 6396000 + }, + { + "epoch": 1.9884451642686056, + "grad_norm": 10.087413787841797, + "learning_rate": 1.6859247262189906e-05, + "loss": 2.6603, + "step": 6396500 + }, + { + "epoch": 1.9886005965490925, + "grad_norm": 9.029168128967285, + "learning_rate": 1.6856656724181793e-05, + "loss": 2.7006, + "step": 6397000 + }, + { + "epoch": 1.9887560288295794, + "grad_norm": 19.005952835083008, + "learning_rate": 1.6854066186173677e-05, + "loss": 2.7009, + "step": 6397500 + }, + { + "epoch": 1.9889114611100662, + "grad_norm": 8.57641887664795, + "learning_rate": 1.6851475648165564e-05, + "loss": 2.6982, + "step": 6398000 + }, + { + "epoch": 1.989066893390553, + "grad_norm": 9.967829704284668, + "learning_rate": 1.6848885110157447e-05, + "loss": 2.7064, + "step": 6398500 + }, + { + "epoch": 1.98922232567104, + "grad_norm": 12.783876419067383, + "learning_rate": 1.6846294572149335e-05, + "loss": 2.7025, + "step": 6399000 + }, + { + "epoch": 1.989377757951527, + "grad_norm": 8.858989715576172, + "learning_rate": 1.684370403414122e-05, + "loss": 2.7445, + "step": 6399500 + }, + { + "epoch": 1.989533190232014, + "grad_norm": 10.320878982543945, + "learning_rate": 1.6841113496133106e-05, + "loss": 2.6874, + "step": 6400000 + }, + { + "epoch": 1.9896886225125008, + "grad_norm": 15.061822891235352, + "learning_rate": 1.6838522958124993e-05, + "loss": 2.6851, + "step": 6400500 + }, + { + "epoch": 1.9898440547929876, + "grad_norm": 9.119864463806152, + "learning_rate": 1.6835932420116876e-05, + "loss": 2.672, + "step": 6401000 + }, + { + "epoch": 1.9899994870734745, + "grad_norm": 19.336837768554688, + "learning_rate": 1.683334188210876e-05, + "loss": 2.731, + "step": 6401500 + }, + { + "epoch": 1.9901549193539614, + "grad_norm": 9.335796356201172, + "learning_rate": 1.6830751344100644e-05, + "loss": 2.6726, + "step": 6402000 + }, + { + "epoch": 1.9903103516344482, + "grad_norm": 9.241072654724121, + "learning_rate": 1.682816080609253e-05, + "loss": 2.6496, + "step": 6402500 + }, + { + "epoch": 1.9904657839149351, + "grad_norm": 12.297025680541992, + "learning_rate": 1.6825570268084418e-05, + "loss": 2.6576, + "step": 6403000 + }, + { + "epoch": 1.990621216195422, + "grad_norm": 12.163830757141113, + "learning_rate": 1.6822979730076302e-05, + "loss": 2.7174, + "step": 6403500 + }, + { + "epoch": 1.9907766484759088, + "grad_norm": 9.234121322631836, + "learning_rate": 1.6820389192068186e-05, + "loss": 2.6986, + "step": 6404000 + }, + { + "epoch": 1.9909320807563957, + "grad_norm": 10.091273307800293, + "learning_rate": 1.6817798654060073e-05, + "loss": 2.736, + "step": 6404500 + }, + { + "epoch": 1.9910875130368826, + "grad_norm": 25.032499313354492, + "learning_rate": 1.681520811605196e-05, + "loss": 2.7668, + "step": 6405000 + }, + { + "epoch": 1.9912429453173695, + "grad_norm": 8.885600090026855, + "learning_rate": 1.6812617578043844e-05, + "loss": 2.7324, + "step": 6405500 + }, + { + "epoch": 1.9913983775978563, + "grad_norm": 8.597947120666504, + "learning_rate": 1.681002704003573e-05, + "loss": 2.7479, + "step": 6406000 + }, + { + "epoch": 1.9915538098783432, + "grad_norm": 9.052772521972656, + "learning_rate": 1.6807436502027615e-05, + "loss": 2.6714, + "step": 6406500 + }, + { + "epoch": 1.99170924215883, + "grad_norm": 10.296463012695312, + "learning_rate": 1.68048459640195e-05, + "loss": 2.6656, + "step": 6407000 + }, + { + "epoch": 1.991864674439317, + "grad_norm": 19.137300491333008, + "learning_rate": 1.6802255426011386e-05, + "loss": 2.7698, + "step": 6407500 + }, + { + "epoch": 1.9920201067198038, + "grad_norm": 7.1319427490234375, + "learning_rate": 1.6799664888003273e-05, + "loss": 2.6735, + "step": 6408000 + }, + { + "epoch": 1.9921755390002907, + "grad_norm": 12.0159273147583, + "learning_rate": 1.6797074349995157e-05, + "loss": 2.6514, + "step": 6408500 + }, + { + "epoch": 1.9923309712807775, + "grad_norm": 8.833171844482422, + "learning_rate": 1.679448381198704e-05, + "loss": 2.6962, + "step": 6409000 + }, + { + "epoch": 1.9924864035612644, + "grad_norm": 9.043194770812988, + "learning_rate": 1.6791893273978928e-05, + "loss": 2.679, + "step": 6409500 + }, + { + "epoch": 1.9926418358417513, + "grad_norm": 9.504008293151855, + "learning_rate": 1.6789302735970815e-05, + "loss": 2.7066, + "step": 6410000 + }, + { + "epoch": 1.9927972681222381, + "grad_norm": 11.163658142089844, + "learning_rate": 1.67867121979627e-05, + "loss": 2.68, + "step": 6410500 + }, + { + "epoch": 1.992952700402725, + "grad_norm": 11.373676300048828, + "learning_rate": 1.6784121659954582e-05, + "loss": 2.6806, + "step": 6411000 + }, + { + "epoch": 1.9931081326832119, + "grad_norm": 6.810459136962891, + "learning_rate": 1.678153112194647e-05, + "loss": 2.7324, + "step": 6411500 + }, + { + "epoch": 1.9932635649636987, + "grad_norm": 11.04246997833252, + "learning_rate": 1.6778940583938353e-05, + "loss": 2.7418, + "step": 6412000 + }, + { + "epoch": 1.9934189972441856, + "grad_norm": 9.277649879455566, + "learning_rate": 1.677635004593024e-05, + "loss": 2.6301, + "step": 6412500 + }, + { + "epoch": 1.9935744295246725, + "grad_norm": 13.067251205444336, + "learning_rate": 1.6773759507922124e-05, + "loss": 2.7088, + "step": 6413000 + }, + { + "epoch": 1.9937298618051593, + "grad_norm": 9.457334518432617, + "learning_rate": 1.677116896991401e-05, + "loss": 2.6937, + "step": 6413500 + }, + { + "epoch": 1.9938852940856462, + "grad_norm": 9.2792329788208, + "learning_rate": 1.6768578431905895e-05, + "loss": 2.7085, + "step": 6414000 + }, + { + "epoch": 1.994040726366133, + "grad_norm": 10.746906280517578, + "learning_rate": 1.676598789389778e-05, + "loss": 2.662, + "step": 6414500 + }, + { + "epoch": 1.99419615864662, + "grad_norm": 11.893027305603027, + "learning_rate": 1.676339735588967e-05, + "loss": 2.6959, + "step": 6415000 + }, + { + "epoch": 1.9943515909271068, + "grad_norm": 11.92031478881836, + "learning_rate": 1.6760806817881553e-05, + "loss": 2.65, + "step": 6415500 + }, + { + "epoch": 1.9945070232075937, + "grad_norm": 10.210070610046387, + "learning_rate": 1.6758216279873437e-05, + "loss": 2.685, + "step": 6416000 + }, + { + "epoch": 1.9946624554880805, + "grad_norm": 9.967700004577637, + "learning_rate": 1.675562574186532e-05, + "loss": 2.634, + "step": 6416500 + }, + { + "epoch": 1.9948178877685674, + "grad_norm": 9.739489555358887, + "learning_rate": 1.6753035203857208e-05, + "loss": 2.7135, + "step": 6417000 + }, + { + "epoch": 1.9949733200490545, + "grad_norm": 11.992161750793457, + "learning_rate": 1.6750444665849095e-05, + "loss": 2.7104, + "step": 6417500 + }, + { + "epoch": 1.9951287523295413, + "grad_norm": 10.64781665802002, + "learning_rate": 1.674785412784098e-05, + "loss": 2.7018, + "step": 6418000 + }, + { + "epoch": 1.9952841846100282, + "grad_norm": 10.80407428741455, + "learning_rate": 1.6745263589832866e-05, + "loss": 2.6955, + "step": 6418500 + }, + { + "epoch": 1.995439616890515, + "grad_norm": 14.200039863586426, + "learning_rate": 1.674267305182475e-05, + "loss": 2.697, + "step": 6419000 + }, + { + "epoch": 1.995595049171002, + "grad_norm": 10.447237968444824, + "learning_rate": 1.6740082513816633e-05, + "loss": 2.6907, + "step": 6419500 + }, + { + "epoch": 1.9957504814514888, + "grad_norm": 10.318354606628418, + "learning_rate": 1.673749197580852e-05, + "loss": 2.6832, + "step": 6420000 + }, + { + "epoch": 1.9959059137319757, + "grad_norm": 10.646127700805664, + "learning_rate": 1.6734901437800408e-05, + "loss": 2.6798, + "step": 6420500 + }, + { + "epoch": 1.9960613460124625, + "grad_norm": 73.57157135009766, + "learning_rate": 1.673231089979229e-05, + "loss": 2.702, + "step": 6421000 + }, + { + "epoch": 1.9962167782929494, + "grad_norm": 8.94066047668457, + "learning_rate": 1.6729720361784175e-05, + "loss": 2.7238, + "step": 6421500 + }, + { + "epoch": 1.9963722105734363, + "grad_norm": 11.330647468566895, + "learning_rate": 1.6727129823776062e-05, + "loss": 2.7114, + "step": 6422000 + }, + { + "epoch": 1.9965276428539231, + "grad_norm": 8.270590782165527, + "learning_rate": 1.672453928576795e-05, + "loss": 2.6902, + "step": 6422500 + }, + { + "epoch": 1.99668307513441, + "grad_norm": 15.941272735595703, + "learning_rate": 1.6721948747759833e-05, + "loss": 2.7256, + "step": 6423000 + }, + { + "epoch": 1.996838507414897, + "grad_norm": 10.302440643310547, + "learning_rate": 1.6719358209751717e-05, + "loss": 2.6708, + "step": 6423500 + }, + { + "epoch": 1.996993939695384, + "grad_norm": 8.344038009643555, + "learning_rate": 1.6716767671743604e-05, + "loss": 2.6959, + "step": 6424000 + }, + { + "epoch": 1.9971493719758708, + "grad_norm": 8.64273452758789, + "learning_rate": 1.6714177133735488e-05, + "loss": 2.6878, + "step": 6424500 + }, + { + "epoch": 1.9973048042563577, + "grad_norm": 39.78676223754883, + "learning_rate": 1.6711586595727375e-05, + "loss": 2.7074, + "step": 6425000 + }, + { + "epoch": 1.9974602365368446, + "grad_norm": 11.871232986450195, + "learning_rate": 1.670899605771926e-05, + "loss": 2.6959, + "step": 6425500 + }, + { + "epoch": 1.9976156688173314, + "grad_norm": 11.391698837280273, + "learning_rate": 1.6706405519711146e-05, + "loss": 2.725, + "step": 6426000 + }, + { + "epoch": 1.9977711010978183, + "grad_norm": 9.49240779876709, + "learning_rate": 1.670381498170303e-05, + "loss": 2.6708, + "step": 6426500 + }, + { + "epoch": 1.9979265333783052, + "grad_norm": 9.632977485656738, + "learning_rate": 1.6701224443694917e-05, + "loss": 2.7205, + "step": 6427000 + }, + { + "epoch": 1.998081965658792, + "grad_norm": 22.438718795776367, + "learning_rate": 1.6698633905686804e-05, + "loss": 2.6888, + "step": 6427500 + }, + { + "epoch": 1.998237397939279, + "grad_norm": 13.906216621398926, + "learning_rate": 1.6696043367678688e-05, + "loss": 2.6897, + "step": 6428000 + }, + { + "epoch": 1.9983928302197658, + "grad_norm": 8.254311561584473, + "learning_rate": 1.669345282967057e-05, + "loss": 2.7241, + "step": 6428500 + }, + { + "epoch": 1.9985482625002526, + "grad_norm": 10.294663429260254, + "learning_rate": 1.6690862291662455e-05, + "loss": 2.6735, + "step": 6429000 + }, + { + "epoch": 1.9987036947807395, + "grad_norm": 13.106575012207031, + "learning_rate": 1.6688271753654343e-05, + "loss": 2.6631, + "step": 6429500 + }, + { + "epoch": 1.9988591270612264, + "grad_norm": 8.120759010314941, + "learning_rate": 1.668568121564623e-05, + "loss": 2.6667, + "step": 6430000 + }, + { + "epoch": 1.9990145593417132, + "grad_norm": 38.5419921875, + "learning_rate": 1.6683090677638113e-05, + "loss": 2.7286, + "step": 6430500 + }, + { + "epoch": 1.9991699916222, + "grad_norm": 9.600727081298828, + "learning_rate": 1.6680500139629997e-05, + "loss": 2.7019, + "step": 6431000 + }, + { + "epoch": 1.999325423902687, + "grad_norm": 11.198668479919434, + "learning_rate": 1.6677909601621884e-05, + "loss": 2.7116, + "step": 6431500 + }, + { + "epoch": 1.9994808561831738, + "grad_norm": 8.529614448547363, + "learning_rate": 1.667531906361377e-05, + "loss": 2.7478, + "step": 6432000 + }, + { + "epoch": 1.9996362884636607, + "grad_norm": 10.938284873962402, + "learning_rate": 1.6672728525605655e-05, + "loss": 2.6555, + "step": 6432500 + }, + { + "epoch": 1.9997917207441476, + "grad_norm": 11.252143859863281, + "learning_rate": 1.6670137987597542e-05, + "loss": 2.6866, + "step": 6433000 + }, + { + "epoch": 1.9999471530246344, + "grad_norm": 8.083330154418945, + "learning_rate": 1.6667547449589426e-05, + "loss": 2.6888, + "step": 6433500 + }, + { + "epoch": 2.0001025853051213, + "grad_norm": 8.900067329406738, + "learning_rate": 1.666495691158131e-05, + "loss": 2.6804, + "step": 6434000 + }, + { + "epoch": 2.000258017585608, + "grad_norm": 9.658205032348633, + "learning_rate": 1.6662366373573197e-05, + "loss": 2.7319, + "step": 6434500 + }, + { + "epoch": 2.000413449866095, + "grad_norm": 9.0910062789917, + "learning_rate": 1.6659775835565084e-05, + "loss": 2.6554, + "step": 6435000 + }, + { + "epoch": 2.000568882146582, + "grad_norm": 13.724339485168457, + "learning_rate": 1.6657185297556968e-05, + "loss": 2.6922, + "step": 6435500 + }, + { + "epoch": 2.0007243144270688, + "grad_norm": 7.327503204345703, + "learning_rate": 1.6654594759548852e-05, + "loss": 2.6306, + "step": 6436000 + }, + { + "epoch": 2.0008797467075556, + "grad_norm": 9.775185585021973, + "learning_rate": 1.665200422154074e-05, + "loss": 2.6761, + "step": 6436500 + }, + { + "epoch": 2.0010351789880425, + "grad_norm": 10.63143253326416, + "learning_rate": 1.6649413683532626e-05, + "loss": 2.6786, + "step": 6437000 + }, + { + "epoch": 2.0011906112685294, + "grad_norm": 15.4645414352417, + "learning_rate": 1.664682314552451e-05, + "loss": 2.6899, + "step": 6437500 + }, + { + "epoch": 2.0013460435490162, + "grad_norm": 8.998997688293457, + "learning_rate": 1.6644232607516394e-05, + "loss": 2.727, + "step": 6438000 + }, + { + "epoch": 2.001501475829503, + "grad_norm": 9.762499809265137, + "learning_rate": 1.664164206950828e-05, + "loss": 2.6903, + "step": 6438500 + }, + { + "epoch": 2.00165690810999, + "grad_norm": 10.734631538391113, + "learning_rate": 1.6639051531500165e-05, + "loss": 2.6919, + "step": 6439000 + }, + { + "epoch": 2.001812340390477, + "grad_norm": 8.690749168395996, + "learning_rate": 1.6636460993492052e-05, + "loss": 2.6998, + "step": 6439500 + }, + { + "epoch": 2.0019677726709637, + "grad_norm": 9.48694896697998, + "learning_rate": 1.6633870455483936e-05, + "loss": 2.7208, + "step": 6440000 + }, + { + "epoch": 2.0021232049514506, + "grad_norm": 8.628194808959961, + "learning_rate": 1.6631279917475823e-05, + "loss": 2.6662, + "step": 6440500 + }, + { + "epoch": 2.0022786372319374, + "grad_norm": 11.965845108032227, + "learning_rate": 1.6628689379467706e-05, + "loss": 2.6947, + "step": 6441000 + }, + { + "epoch": 2.0024340695124243, + "grad_norm": 9.038287162780762, + "learning_rate": 1.6626098841459594e-05, + "loss": 2.66, + "step": 6441500 + }, + { + "epoch": 2.002589501792911, + "grad_norm": 9.428229331970215, + "learning_rate": 1.662350830345148e-05, + "loss": 2.6743, + "step": 6442000 + }, + { + "epoch": 2.002744934073398, + "grad_norm": 7.622143745422363, + "learning_rate": 1.6620917765443365e-05, + "loss": 2.6945, + "step": 6442500 + }, + { + "epoch": 2.002900366353885, + "grad_norm": 22.636173248291016, + "learning_rate": 1.6618327227435248e-05, + "loss": 2.6829, + "step": 6443000 + }, + { + "epoch": 2.0030557986343718, + "grad_norm": 12.86312198638916, + "learning_rate": 1.6615736689427132e-05, + "loss": 2.6787, + "step": 6443500 + }, + { + "epoch": 2.0032112309148586, + "grad_norm": 10.938852310180664, + "learning_rate": 1.661314615141902e-05, + "loss": 2.6943, + "step": 6444000 + }, + { + "epoch": 2.003366663195346, + "grad_norm": 9.324398040771484, + "learning_rate": 1.6610555613410906e-05, + "loss": 2.6717, + "step": 6444500 + }, + { + "epoch": 2.003522095475833, + "grad_norm": 12.202657699584961, + "learning_rate": 1.660796507540279e-05, + "loss": 2.7338, + "step": 6445000 + }, + { + "epoch": 2.0036775277563197, + "grad_norm": 9.547932624816895, + "learning_rate": 1.6605374537394677e-05, + "loss": 2.7226, + "step": 6445500 + }, + { + "epoch": 2.0038329600368066, + "grad_norm": 8.299005508422852, + "learning_rate": 1.660278399938656e-05, + "loss": 2.7047, + "step": 6446000 + }, + { + "epoch": 2.0039883923172934, + "grad_norm": 9.419748306274414, + "learning_rate": 1.6600193461378448e-05, + "loss": 2.6899, + "step": 6446500 + }, + { + "epoch": 2.0041438245977803, + "grad_norm": 8.865265846252441, + "learning_rate": 1.6597602923370332e-05, + "loss": 2.7354, + "step": 6447000 + }, + { + "epoch": 2.004299256878267, + "grad_norm": 8.564118385314941, + "learning_rate": 1.659501238536222e-05, + "loss": 2.6909, + "step": 6447500 + }, + { + "epoch": 2.004454689158754, + "grad_norm": 12.863534927368164, + "learning_rate": 1.6592421847354103e-05, + "loss": 2.6933, + "step": 6448000 + }, + { + "epoch": 2.004610121439241, + "grad_norm": 9.48692512512207, + "learning_rate": 1.6589831309345987e-05, + "loss": 2.7635, + "step": 6448500 + }, + { + "epoch": 2.0047655537197278, + "grad_norm": 11.246330261230469, + "learning_rate": 1.6587240771337874e-05, + "loss": 2.6704, + "step": 6449000 + }, + { + "epoch": 2.0049209860002146, + "grad_norm": 12.011908531188965, + "learning_rate": 1.658465023332976e-05, + "loss": 2.6899, + "step": 6449500 + }, + { + "epoch": 2.0050764182807015, + "grad_norm": 16.97980499267578, + "learning_rate": 1.6582059695321645e-05, + "loss": 2.6811, + "step": 6450000 + }, + { + "epoch": 2.0052318505611884, + "grad_norm": 8.854873657226562, + "learning_rate": 1.657946915731353e-05, + "loss": 2.6412, + "step": 6450500 + }, + { + "epoch": 2.0053872828416752, + "grad_norm": 14.080272674560547, + "learning_rate": 1.6576878619305416e-05, + "loss": 2.6917, + "step": 6451000 + }, + { + "epoch": 2.005542715122162, + "grad_norm": 9.183526039123535, + "learning_rate": 1.6574288081297303e-05, + "loss": 2.6928, + "step": 6451500 + }, + { + "epoch": 2.005698147402649, + "grad_norm": 9.240008354187012, + "learning_rate": 1.6571697543289187e-05, + "loss": 2.7117, + "step": 6452000 + }, + { + "epoch": 2.005853579683136, + "grad_norm": 8.282727241516113, + "learning_rate": 1.656910700528107e-05, + "loss": 2.6633, + "step": 6452500 + }, + { + "epoch": 2.0060090119636227, + "grad_norm": 9.729737281799316, + "learning_rate": 1.6566516467272957e-05, + "loss": 2.6756, + "step": 6453000 + }, + { + "epoch": 2.0061644442441096, + "grad_norm": 8.45427131652832, + "learning_rate": 1.656392592926484e-05, + "loss": 2.7101, + "step": 6453500 + }, + { + "epoch": 2.0063198765245964, + "grad_norm": 7.685477256774902, + "learning_rate": 1.656133539125673e-05, + "loss": 2.7007, + "step": 6454000 + }, + { + "epoch": 2.0064753088050833, + "grad_norm": 8.731311798095703, + "learning_rate": 1.6558744853248616e-05, + "loss": 2.709, + "step": 6454500 + }, + { + "epoch": 2.00663074108557, + "grad_norm": 16.14970588684082, + "learning_rate": 1.65561543152405e-05, + "loss": 2.681, + "step": 6455000 + }, + { + "epoch": 2.006786173366057, + "grad_norm": 19.54043960571289, + "learning_rate": 1.6553563777232383e-05, + "loss": 2.6575, + "step": 6455500 + }, + { + "epoch": 2.006941605646544, + "grad_norm": 8.900075912475586, + "learning_rate": 1.6550973239224267e-05, + "loss": 2.6681, + "step": 6456000 + }, + { + "epoch": 2.0070970379270308, + "grad_norm": 11.264548301696777, + "learning_rate": 1.6548382701216157e-05, + "loss": 2.747, + "step": 6456500 + }, + { + "epoch": 2.0072524702075176, + "grad_norm": 10.58898639678955, + "learning_rate": 1.654579216320804e-05, + "loss": 2.6447, + "step": 6457000 + }, + { + "epoch": 2.0074079024880045, + "grad_norm": 8.25118350982666, + "learning_rate": 1.6543201625199925e-05, + "loss": 2.6527, + "step": 6457500 + }, + { + "epoch": 2.0075633347684914, + "grad_norm": 11.251602172851562, + "learning_rate": 1.654061108719181e-05, + "loss": 2.7234, + "step": 6458000 + }, + { + "epoch": 2.0077187670489782, + "grad_norm": 11.513348579406738, + "learning_rate": 1.6538020549183696e-05, + "loss": 2.6991, + "step": 6458500 + }, + { + "epoch": 2.007874199329465, + "grad_norm": 14.713044166564941, + "learning_rate": 1.6535430011175583e-05, + "loss": 2.6615, + "step": 6459000 + }, + { + "epoch": 2.008029631609952, + "grad_norm": 12.84921932220459, + "learning_rate": 1.6532839473167467e-05, + "loss": 2.7186, + "step": 6459500 + }, + { + "epoch": 2.008185063890439, + "grad_norm": 8.866204261779785, + "learning_rate": 1.6530248935159354e-05, + "loss": 2.6946, + "step": 6460000 + }, + { + "epoch": 2.0083404961709257, + "grad_norm": 29.302043914794922, + "learning_rate": 1.6527658397151238e-05, + "loss": 2.6946, + "step": 6460500 + }, + { + "epoch": 2.0084959284514126, + "grad_norm": 11.18673324584961, + "learning_rate": 1.652506785914312e-05, + "loss": 2.6949, + "step": 6461000 + }, + { + "epoch": 2.0086513607318994, + "grad_norm": 8.700379371643066, + "learning_rate": 1.652247732113501e-05, + "loss": 2.689, + "step": 6461500 + }, + { + "epoch": 2.0088067930123863, + "grad_norm": 8.242537498474121, + "learning_rate": 1.6519886783126896e-05, + "loss": 2.6764, + "step": 6462000 + }, + { + "epoch": 2.008962225292873, + "grad_norm": 11.127847671508789, + "learning_rate": 1.651729624511878e-05, + "loss": 2.7132, + "step": 6462500 + }, + { + "epoch": 2.00911765757336, + "grad_norm": 9.298707008361816, + "learning_rate": 1.6514705707110663e-05, + "loss": 2.6957, + "step": 6463000 + }, + { + "epoch": 2.009273089853847, + "grad_norm": 13.776211738586426, + "learning_rate": 1.651211516910255e-05, + "loss": 2.7016, + "step": 6463500 + }, + { + "epoch": 2.0094285221343338, + "grad_norm": 11.157293319702148, + "learning_rate": 1.6509524631094438e-05, + "loss": 2.7192, + "step": 6464000 + }, + { + "epoch": 2.0095839544148206, + "grad_norm": 14.4839448928833, + "learning_rate": 1.650693409308632e-05, + "loss": 2.7041, + "step": 6464500 + }, + { + "epoch": 2.0097393866953075, + "grad_norm": 7.813401699066162, + "learning_rate": 1.6504343555078205e-05, + "loss": 2.6843, + "step": 6465000 + }, + { + "epoch": 2.0098948189757944, + "grad_norm": 11.114623069763184, + "learning_rate": 1.6501753017070092e-05, + "loss": 2.6782, + "step": 6465500 + }, + { + "epoch": 2.0100502512562812, + "grad_norm": 8.992375373840332, + "learning_rate": 1.6499162479061976e-05, + "loss": 2.6306, + "step": 6466000 + }, + { + "epoch": 2.010205683536768, + "grad_norm": 9.001315116882324, + "learning_rate": 1.6496571941053863e-05, + "loss": 2.6782, + "step": 6466500 + }, + { + "epoch": 2.010361115817255, + "grad_norm": 14.060667037963867, + "learning_rate": 1.6493981403045747e-05, + "loss": 2.6612, + "step": 6467000 + }, + { + "epoch": 2.010516548097742, + "grad_norm": 11.850557327270508, + "learning_rate": 1.6491390865037634e-05, + "loss": 2.6809, + "step": 6467500 + }, + { + "epoch": 2.0106719803782287, + "grad_norm": 9.88760757446289, + "learning_rate": 1.6488800327029518e-05, + "loss": 2.6976, + "step": 6468000 + }, + { + "epoch": 2.010827412658716, + "grad_norm": 8.66817569732666, + "learning_rate": 1.6486209789021405e-05, + "loss": 2.6682, + "step": 6468500 + }, + { + "epoch": 2.010982844939203, + "grad_norm": 7.890036106109619, + "learning_rate": 1.6483619251013292e-05, + "loss": 2.7264, + "step": 6469000 + }, + { + "epoch": 2.0111382772196897, + "grad_norm": 9.570385932922363, + "learning_rate": 1.6481028713005176e-05, + "loss": 2.6915, + "step": 6469500 + }, + { + "epoch": 2.0112937095001766, + "grad_norm": 9.50760555267334, + "learning_rate": 1.647843817499706e-05, + "loss": 2.7154, + "step": 6470000 + }, + { + "epoch": 2.0114491417806635, + "grad_norm": 8.90040111541748, + "learning_rate": 1.6475847636988943e-05, + "loss": 2.6589, + "step": 6470500 + }, + { + "epoch": 2.0116045740611503, + "grad_norm": 13.356534957885742, + "learning_rate": 1.647325709898083e-05, + "loss": 2.6795, + "step": 6471000 + }, + { + "epoch": 2.011760006341637, + "grad_norm": 8.76031494140625, + "learning_rate": 1.6470666560972718e-05, + "loss": 2.7136, + "step": 6471500 + }, + { + "epoch": 2.011915438622124, + "grad_norm": 8.741905212402344, + "learning_rate": 1.64680760229646e-05, + "loss": 2.6982, + "step": 6472000 + }, + { + "epoch": 2.012070870902611, + "grad_norm": 9.721760749816895, + "learning_rate": 1.646548548495649e-05, + "loss": 2.7009, + "step": 6472500 + }, + { + "epoch": 2.012226303183098, + "grad_norm": 10.843547821044922, + "learning_rate": 1.6462894946948372e-05, + "loss": 2.6727, + "step": 6473000 + }, + { + "epoch": 2.0123817354635847, + "grad_norm": 12.141182899475098, + "learning_rate": 1.646030440894026e-05, + "loss": 2.6924, + "step": 6473500 + }, + { + "epoch": 2.0125371677440715, + "grad_norm": 11.254984855651855, + "learning_rate": 1.6457713870932143e-05, + "loss": 2.6735, + "step": 6474000 + }, + { + "epoch": 2.0126926000245584, + "grad_norm": 11.341414451599121, + "learning_rate": 1.645512333292403e-05, + "loss": 2.6548, + "step": 6474500 + }, + { + "epoch": 2.0128480323050453, + "grad_norm": 10.30846881866455, + "learning_rate": 1.6452532794915914e-05, + "loss": 2.6463, + "step": 6475000 + }, + { + "epoch": 2.013003464585532, + "grad_norm": 9.154912948608398, + "learning_rate": 1.6449942256907798e-05, + "loss": 2.6952, + "step": 6475500 + }, + { + "epoch": 2.013158896866019, + "grad_norm": 9.516525268554688, + "learning_rate": 1.6447351718899685e-05, + "loss": 2.7151, + "step": 6476000 + }, + { + "epoch": 2.013314329146506, + "grad_norm": 12.241786003112793, + "learning_rate": 1.6444761180891572e-05, + "loss": 2.7149, + "step": 6476500 + }, + { + "epoch": 2.0134697614269927, + "grad_norm": 7.77971887588501, + "learning_rate": 1.6442170642883456e-05, + "loss": 2.6895, + "step": 6477000 + }, + { + "epoch": 2.0136251937074796, + "grad_norm": 9.670820236206055, + "learning_rate": 1.643958010487534e-05, + "loss": 2.7094, + "step": 6477500 + }, + { + "epoch": 2.0137806259879665, + "grad_norm": 13.206332206726074, + "learning_rate": 1.6436989566867227e-05, + "loss": 2.6425, + "step": 6478000 + }, + { + "epoch": 2.0139360582684533, + "grad_norm": 10.194438934326172, + "learning_rate": 1.6434399028859114e-05, + "loss": 2.66, + "step": 6478500 + }, + { + "epoch": 2.01409149054894, + "grad_norm": 26.05054473876953, + "learning_rate": 1.6431808490850998e-05, + "loss": 2.7214, + "step": 6479000 + }, + { + "epoch": 2.014246922829427, + "grad_norm": 6.831974983215332, + "learning_rate": 1.6429217952842882e-05, + "loss": 2.6378, + "step": 6479500 + }, + { + "epoch": 2.014402355109914, + "grad_norm": 8.998430252075195, + "learning_rate": 1.642662741483477e-05, + "loss": 2.7089, + "step": 6480000 + }, + { + "epoch": 2.014557787390401, + "grad_norm": 8.582459449768066, + "learning_rate": 1.6424036876826653e-05, + "loss": 2.6652, + "step": 6480500 + }, + { + "epoch": 2.0147132196708877, + "grad_norm": 21.410598754882812, + "learning_rate": 1.642144633881854e-05, + "loss": 2.7196, + "step": 6481000 + }, + { + "epoch": 2.0148686519513745, + "grad_norm": 9.418722152709961, + "learning_rate": 1.6418855800810427e-05, + "loss": 2.721, + "step": 6481500 + }, + { + "epoch": 2.0150240842318614, + "grad_norm": 35.74826431274414, + "learning_rate": 1.641626526280231e-05, + "loss": 2.7158, + "step": 6482000 + }, + { + "epoch": 2.0151795165123483, + "grad_norm": 7.971682071685791, + "learning_rate": 1.6413674724794194e-05, + "loss": 2.6786, + "step": 6482500 + }, + { + "epoch": 2.015334948792835, + "grad_norm": 9.055005073547363, + "learning_rate": 1.6411084186786078e-05, + "loss": 2.7552, + "step": 6483000 + }, + { + "epoch": 2.015490381073322, + "grad_norm": 8.716880798339844, + "learning_rate": 1.640849364877797e-05, + "loss": 2.712, + "step": 6483500 + }, + { + "epoch": 2.015645813353809, + "grad_norm": 16.677343368530273, + "learning_rate": 1.6405903110769853e-05, + "loss": 2.6833, + "step": 6484000 + }, + { + "epoch": 2.0158012456342957, + "grad_norm": 10.227811813354492, + "learning_rate": 1.6403312572761736e-05, + "loss": 2.6576, + "step": 6484500 + }, + { + "epoch": 2.0159566779147826, + "grad_norm": 10.776406288146973, + "learning_rate": 1.640072203475362e-05, + "loss": 2.678, + "step": 6485000 + }, + { + "epoch": 2.0161121101952695, + "grad_norm": 10.207626342773438, + "learning_rate": 1.6398131496745507e-05, + "loss": 2.7399, + "step": 6485500 + }, + { + "epoch": 2.0162675424757563, + "grad_norm": 9.830344200134277, + "learning_rate": 1.6395540958737394e-05, + "loss": 2.6706, + "step": 6486000 + }, + { + "epoch": 2.016422974756243, + "grad_norm": 7.759247779846191, + "learning_rate": 1.6392950420729278e-05, + "loss": 2.6841, + "step": 6486500 + }, + { + "epoch": 2.01657840703673, + "grad_norm": 8.590579986572266, + "learning_rate": 1.6390359882721165e-05, + "loss": 2.6803, + "step": 6487000 + }, + { + "epoch": 2.016733839317217, + "grad_norm": 10.902907371520996, + "learning_rate": 1.638776934471305e-05, + "loss": 2.7026, + "step": 6487500 + }, + { + "epoch": 2.016889271597704, + "grad_norm": 8.361395835876465, + "learning_rate": 1.6385178806704933e-05, + "loss": 2.6739, + "step": 6488000 + }, + { + "epoch": 2.0170447038781907, + "grad_norm": 6.524913787841797, + "learning_rate": 1.638258826869682e-05, + "loss": 2.7211, + "step": 6488500 + }, + { + "epoch": 2.0172001361586775, + "grad_norm": 15.23338508605957, + "learning_rate": 1.6379997730688707e-05, + "loss": 2.6962, + "step": 6489000 + }, + { + "epoch": 2.0173555684391644, + "grad_norm": 12.58072280883789, + "learning_rate": 1.637740719268059e-05, + "loss": 2.7008, + "step": 6489500 + }, + { + "epoch": 2.0175110007196513, + "grad_norm": 10.544635772705078, + "learning_rate": 1.6374816654672475e-05, + "loss": 2.7276, + "step": 6490000 + }, + { + "epoch": 2.017666433000138, + "grad_norm": 10.911389350891113, + "learning_rate": 1.6372226116664362e-05, + "loss": 2.6805, + "step": 6490500 + }, + { + "epoch": 2.017821865280625, + "grad_norm": 6.826104164123535, + "learning_rate": 1.636963557865625e-05, + "loss": 2.676, + "step": 6491000 + }, + { + "epoch": 2.017977297561112, + "grad_norm": 16.620500564575195, + "learning_rate": 1.6367045040648133e-05, + "loss": 2.7021, + "step": 6491500 + }, + { + "epoch": 2.0181327298415987, + "grad_norm": 10.994673728942871, + "learning_rate": 1.6364454502640017e-05, + "loss": 2.669, + "step": 6492000 + }, + { + "epoch": 2.0182881621220856, + "grad_norm": 8.447381019592285, + "learning_rate": 1.6361863964631904e-05, + "loss": 2.6788, + "step": 6492500 + }, + { + "epoch": 2.018443594402573, + "grad_norm": 9.640460968017578, + "learning_rate": 1.6359273426623787e-05, + "loss": 2.6589, + "step": 6493000 + }, + { + "epoch": 2.01859902668306, + "grad_norm": 13.980912208557129, + "learning_rate": 1.6356682888615675e-05, + "loss": 2.6432, + "step": 6493500 + }, + { + "epoch": 2.0187544589635467, + "grad_norm": 34.995361328125, + "learning_rate": 1.635409235060756e-05, + "loss": 2.7589, + "step": 6494000 + }, + { + "epoch": 2.0189098912440335, + "grad_norm": 10.248311042785645, + "learning_rate": 1.6351501812599445e-05, + "loss": 2.6908, + "step": 6494500 + }, + { + "epoch": 2.0190653235245204, + "grad_norm": 9.190506935119629, + "learning_rate": 1.634891127459133e-05, + "loss": 2.619, + "step": 6495000 + }, + { + "epoch": 2.0192207558050073, + "grad_norm": 10.828315734863281, + "learning_rate": 1.6346320736583216e-05, + "loss": 2.6961, + "step": 6495500 + }, + { + "epoch": 2.019376188085494, + "grad_norm": 10.956635475158691, + "learning_rate": 1.6343730198575104e-05, + "loss": 2.6936, + "step": 6496000 + }, + { + "epoch": 2.019531620365981, + "grad_norm": 11.513629913330078, + "learning_rate": 1.6341139660566987e-05, + "loss": 2.7108, + "step": 6496500 + }, + { + "epoch": 2.019687052646468, + "grad_norm": 10.150827407836914, + "learning_rate": 1.633854912255887e-05, + "loss": 2.6717, + "step": 6497000 + }, + { + "epoch": 2.0198424849269547, + "grad_norm": 8.666056632995605, + "learning_rate": 1.6335958584550755e-05, + "loss": 2.6745, + "step": 6497500 + }, + { + "epoch": 2.0199979172074416, + "grad_norm": 10.490804672241211, + "learning_rate": 1.6333368046542642e-05, + "loss": 2.667, + "step": 6498000 + }, + { + "epoch": 2.0201533494879285, + "grad_norm": 8.197916984558105, + "learning_rate": 1.633077750853453e-05, + "loss": 2.6658, + "step": 6498500 + }, + { + "epoch": 2.0203087817684153, + "grad_norm": 10.129631042480469, + "learning_rate": 1.6328186970526413e-05, + "loss": 2.6963, + "step": 6499000 + }, + { + "epoch": 2.020464214048902, + "grad_norm": 10.088994026184082, + "learning_rate": 1.63255964325183e-05, + "loss": 2.698, + "step": 6499500 + }, + { + "epoch": 2.020619646329389, + "grad_norm": 11.507957458496094, + "learning_rate": 1.6323005894510184e-05, + "loss": 2.7011, + "step": 6500000 + }, + { + "epoch": 2.020775078609876, + "grad_norm": 10.744966506958008, + "learning_rate": 1.632041535650207e-05, + "loss": 2.6911, + "step": 6500500 + }, + { + "epoch": 2.020930510890363, + "grad_norm": 8.667263984680176, + "learning_rate": 1.6317824818493955e-05, + "loss": 2.6951, + "step": 6501000 + }, + { + "epoch": 2.0210859431708497, + "grad_norm": 12.514302253723145, + "learning_rate": 1.6315234280485842e-05, + "loss": 2.6847, + "step": 6501500 + }, + { + "epoch": 2.0212413754513365, + "grad_norm": 20.29636573791504, + "learning_rate": 1.6312643742477726e-05, + "loss": 2.6907, + "step": 6502000 + }, + { + "epoch": 2.0213968077318234, + "grad_norm": 73.62750244140625, + "learning_rate": 1.631005320446961e-05, + "loss": 2.6919, + "step": 6502500 + }, + { + "epoch": 2.0215522400123103, + "grad_norm": 18.660926818847656, + "learning_rate": 1.6307462666461497e-05, + "loss": 2.6471, + "step": 6503000 + }, + { + "epoch": 2.021707672292797, + "grad_norm": 9.835565567016602, + "learning_rate": 1.6304872128453384e-05, + "loss": 2.7029, + "step": 6503500 + }, + { + "epoch": 2.021863104573284, + "grad_norm": 8.378204345703125, + "learning_rate": 1.6302281590445268e-05, + "loss": 2.6929, + "step": 6504000 + }, + { + "epoch": 2.022018536853771, + "grad_norm": 10.506912231445312, + "learning_rate": 1.629969105243715e-05, + "loss": 2.707, + "step": 6504500 + }, + { + "epoch": 2.0221739691342577, + "grad_norm": 9.00632381439209, + "learning_rate": 1.629710051442904e-05, + "loss": 2.6983, + "step": 6505000 + }, + { + "epoch": 2.0223294014147446, + "grad_norm": 8.948844909667969, + "learning_rate": 1.6294509976420926e-05, + "loss": 2.6557, + "step": 6505500 + }, + { + "epoch": 2.0224848336952315, + "grad_norm": 8.801712989807129, + "learning_rate": 1.629191943841281e-05, + "loss": 2.6953, + "step": 6506000 + }, + { + "epoch": 2.0226402659757183, + "grad_norm": 11.228815078735352, + "learning_rate": 1.6289328900404693e-05, + "loss": 2.7073, + "step": 6506500 + }, + { + "epoch": 2.022795698256205, + "grad_norm": 8.246481895446777, + "learning_rate": 1.628673836239658e-05, + "loss": 2.702, + "step": 6507000 + }, + { + "epoch": 2.022951130536692, + "grad_norm": 8.658482551574707, + "learning_rate": 1.6284147824388464e-05, + "loss": 2.7324, + "step": 6507500 + }, + { + "epoch": 2.023106562817179, + "grad_norm": 17.942352294921875, + "learning_rate": 1.628155728638035e-05, + "loss": 2.6753, + "step": 6508000 + }, + { + "epoch": 2.023261995097666, + "grad_norm": 11.20892333984375, + "learning_rate": 1.627896674837224e-05, + "loss": 2.7414, + "step": 6508500 + }, + { + "epoch": 2.0234174273781527, + "grad_norm": 8.72672176361084, + "learning_rate": 1.6276376210364122e-05, + "loss": 2.7463, + "step": 6509000 + }, + { + "epoch": 2.0235728596586395, + "grad_norm": 8.996535301208496, + "learning_rate": 1.6273785672356006e-05, + "loss": 2.7306, + "step": 6509500 + }, + { + "epoch": 2.0237282919391264, + "grad_norm": 8.152509689331055, + "learning_rate": 1.627119513434789e-05, + "loss": 2.7175, + "step": 6510000 + }, + { + "epoch": 2.0238837242196133, + "grad_norm": 9.312874794006348, + "learning_rate": 1.626860459633978e-05, + "loss": 2.6719, + "step": 6510500 + }, + { + "epoch": 2.0240391565001, + "grad_norm": 10.23707389831543, + "learning_rate": 1.6266014058331664e-05, + "loss": 2.6914, + "step": 6511000 + }, + { + "epoch": 2.024194588780587, + "grad_norm": 16.89018440246582, + "learning_rate": 1.6263423520323548e-05, + "loss": 2.7091, + "step": 6511500 + }, + { + "epoch": 2.024350021061074, + "grad_norm": 55.24746322631836, + "learning_rate": 1.6260832982315435e-05, + "loss": 2.7407, + "step": 6512000 + }, + { + "epoch": 2.0245054533415607, + "grad_norm": 7.9032368659973145, + "learning_rate": 1.625824244430732e-05, + "loss": 2.6889, + "step": 6512500 + }, + { + "epoch": 2.0246608856220476, + "grad_norm": 14.365052223205566, + "learning_rate": 1.6255651906299206e-05, + "loss": 2.6479, + "step": 6513000 + }, + { + "epoch": 2.0248163179025345, + "grad_norm": 17.851566314697266, + "learning_rate": 1.625306136829109e-05, + "loss": 2.6711, + "step": 6513500 + }, + { + "epoch": 2.0249717501830213, + "grad_norm": 9.518136978149414, + "learning_rate": 1.6250470830282977e-05, + "loss": 2.6949, + "step": 6514000 + }, + { + "epoch": 2.025127182463508, + "grad_norm": 10.040191650390625, + "learning_rate": 1.624788029227486e-05, + "loss": 2.646, + "step": 6514500 + }, + { + "epoch": 2.025282614743995, + "grad_norm": 10.540205955505371, + "learning_rate": 1.6245289754266744e-05, + "loss": 2.6996, + "step": 6515000 + }, + { + "epoch": 2.025438047024482, + "grad_norm": 9.443804740905762, + "learning_rate": 1.624269921625863e-05, + "loss": 2.6446, + "step": 6515500 + }, + { + "epoch": 2.025593479304969, + "grad_norm": 8.86300277709961, + "learning_rate": 1.624010867825052e-05, + "loss": 2.697, + "step": 6516000 + }, + { + "epoch": 2.025748911585456, + "grad_norm": 9.125519752502441, + "learning_rate": 1.6237518140242402e-05, + "loss": 2.6824, + "step": 6516500 + }, + { + "epoch": 2.025904343865943, + "grad_norm": 10.833839416503906, + "learning_rate": 1.6234927602234286e-05, + "loss": 2.7146, + "step": 6517000 + }, + { + "epoch": 2.02605977614643, + "grad_norm": 12.79865550994873, + "learning_rate": 1.6232337064226173e-05, + "loss": 2.6292, + "step": 6517500 + }, + { + "epoch": 2.0262152084269167, + "grad_norm": 8.964180946350098, + "learning_rate": 1.622974652621806e-05, + "loss": 2.6768, + "step": 6518000 + }, + { + "epoch": 2.0263706407074036, + "grad_norm": 37.824283599853516, + "learning_rate": 1.6227155988209944e-05, + "loss": 2.6991, + "step": 6518500 + }, + { + "epoch": 2.0265260729878904, + "grad_norm": 9.638860702514648, + "learning_rate": 1.6224565450201828e-05, + "loss": 2.6429, + "step": 6519000 + }, + { + "epoch": 2.0266815052683773, + "grad_norm": 10.238956451416016, + "learning_rate": 1.6221974912193715e-05, + "loss": 2.6887, + "step": 6519500 + }, + { + "epoch": 2.026836937548864, + "grad_norm": 10.441139221191406, + "learning_rate": 1.62193843741856e-05, + "loss": 2.7587, + "step": 6520000 + }, + { + "epoch": 2.026992369829351, + "grad_norm": 14.683156967163086, + "learning_rate": 1.6216793836177486e-05, + "loss": 2.7309, + "step": 6520500 + }, + { + "epoch": 2.027147802109838, + "grad_norm": 6.23175048828125, + "learning_rate": 1.6214203298169373e-05, + "loss": 2.6968, + "step": 6521000 + }, + { + "epoch": 2.027303234390325, + "grad_norm": 10.746976852416992, + "learning_rate": 1.6211612760161257e-05, + "loss": 2.7189, + "step": 6521500 + }, + { + "epoch": 2.0274586666708116, + "grad_norm": 11.928051948547363, + "learning_rate": 1.620902222215314e-05, + "loss": 2.769, + "step": 6522000 + }, + { + "epoch": 2.0276140989512985, + "grad_norm": 9.95845890045166, + "learning_rate": 1.6206431684145028e-05, + "loss": 2.6461, + "step": 6522500 + }, + { + "epoch": 2.0277695312317854, + "grad_norm": 9.106955528259277, + "learning_rate": 1.6203841146136915e-05, + "loss": 2.6678, + "step": 6523000 + }, + { + "epoch": 2.0279249635122722, + "grad_norm": 12.06605052947998, + "learning_rate": 1.62012506081288e-05, + "loss": 2.6842, + "step": 6523500 + }, + { + "epoch": 2.028080395792759, + "grad_norm": 13.183023452758789, + "learning_rate": 1.6198660070120683e-05, + "loss": 2.6478, + "step": 6524000 + }, + { + "epoch": 2.028235828073246, + "grad_norm": 12.39346981048584, + "learning_rate": 1.6196069532112566e-05, + "loss": 2.6429, + "step": 6524500 + }, + { + "epoch": 2.028391260353733, + "grad_norm": 12.75143814086914, + "learning_rate": 1.6193478994104453e-05, + "loss": 2.6864, + "step": 6525000 + }, + { + "epoch": 2.0285466926342197, + "grad_norm": 10.204999923706055, + "learning_rate": 1.619088845609634e-05, + "loss": 2.7138, + "step": 6525500 + }, + { + "epoch": 2.0287021249147066, + "grad_norm": 11.501352310180664, + "learning_rate": 1.6188297918088224e-05, + "loss": 2.682, + "step": 6526000 + }, + { + "epoch": 2.0288575571951935, + "grad_norm": 8.485295295715332, + "learning_rate": 1.618570738008011e-05, + "loss": 2.6759, + "step": 6526500 + }, + { + "epoch": 2.0290129894756803, + "grad_norm": 7.998231410980225, + "learning_rate": 1.6183116842071995e-05, + "loss": 2.6381, + "step": 6527000 + }, + { + "epoch": 2.029168421756167, + "grad_norm": 9.620477676391602, + "learning_rate": 1.6180526304063882e-05, + "loss": 2.6862, + "step": 6527500 + }, + { + "epoch": 2.029323854036654, + "grad_norm": 13.070785522460938, + "learning_rate": 1.6177935766055766e-05, + "loss": 2.6701, + "step": 6528000 + }, + { + "epoch": 2.029479286317141, + "grad_norm": 9.424803733825684, + "learning_rate": 1.6175345228047653e-05, + "loss": 2.6286, + "step": 6528500 + }, + { + "epoch": 2.029634718597628, + "grad_norm": 8.662205696105957, + "learning_rate": 1.6172754690039537e-05, + "loss": 2.6778, + "step": 6529000 + }, + { + "epoch": 2.0297901508781147, + "grad_norm": 10.490798950195312, + "learning_rate": 1.617016415203142e-05, + "loss": 2.699, + "step": 6529500 + }, + { + "epoch": 2.0299455831586015, + "grad_norm": 9.79858112335205, + "learning_rate": 1.616757361402331e-05, + "loss": 2.7136, + "step": 6530000 + }, + { + "epoch": 2.0301010154390884, + "grad_norm": 9.5396146774292, + "learning_rate": 1.6164983076015195e-05, + "loss": 2.6319, + "step": 6530500 + }, + { + "epoch": 2.0302564477195753, + "grad_norm": 9.63933277130127, + "learning_rate": 1.616239253800708e-05, + "loss": 2.6651, + "step": 6531000 + }, + { + "epoch": 2.030411880000062, + "grad_norm": 8.992816925048828, + "learning_rate": 1.6159801999998963e-05, + "loss": 2.6683, + "step": 6531500 + }, + { + "epoch": 2.030567312280549, + "grad_norm": 9.834625244140625, + "learning_rate": 1.615721146199085e-05, + "loss": 2.6746, + "step": 6532000 + }, + { + "epoch": 2.030722744561036, + "grad_norm": 12.741026878356934, + "learning_rate": 1.6154620923982737e-05, + "loss": 2.7688, + "step": 6532500 + }, + { + "epoch": 2.0308781768415227, + "grad_norm": 9.115029335021973, + "learning_rate": 1.615203038597462e-05, + "loss": 2.6962, + "step": 6533000 + }, + { + "epoch": 2.0310336091220096, + "grad_norm": 13.792146682739258, + "learning_rate": 1.6149439847966505e-05, + "loss": 2.6703, + "step": 6533500 + }, + { + "epoch": 2.0311890414024965, + "grad_norm": 7.774941444396973, + "learning_rate": 1.6146849309958392e-05, + "loss": 2.6598, + "step": 6534000 + }, + { + "epoch": 2.0313444736829833, + "grad_norm": 19.328590393066406, + "learning_rate": 1.6144258771950275e-05, + "loss": 2.6226, + "step": 6534500 + }, + { + "epoch": 2.03149990596347, + "grad_norm": 9.992227554321289, + "learning_rate": 1.6141668233942163e-05, + "loss": 2.7158, + "step": 6535000 + }, + { + "epoch": 2.031655338243957, + "grad_norm": 7.473269462585449, + "learning_rate": 1.613907769593405e-05, + "loss": 2.7128, + "step": 6535500 + }, + { + "epoch": 2.031810770524444, + "grad_norm": 13.203424453735352, + "learning_rate": 1.6136487157925934e-05, + "loss": 2.7382, + "step": 6536000 + }, + { + "epoch": 2.031966202804931, + "grad_norm": 11.541528701782227, + "learning_rate": 1.6133896619917817e-05, + "loss": 2.6773, + "step": 6536500 + }, + { + "epoch": 2.0321216350854177, + "grad_norm": 9.468109130859375, + "learning_rate": 1.6131306081909704e-05, + "loss": 2.7276, + "step": 6537000 + }, + { + "epoch": 2.0322770673659045, + "grad_norm": 11.247535705566406, + "learning_rate": 1.612871554390159e-05, + "loss": 2.716, + "step": 6537500 + }, + { + "epoch": 2.0324324996463914, + "grad_norm": 10.290995597839355, + "learning_rate": 1.6126125005893475e-05, + "loss": 2.664, + "step": 6538000 + }, + { + "epoch": 2.0325879319268783, + "grad_norm": 9.800825119018555, + "learning_rate": 1.612353446788536e-05, + "loss": 2.67, + "step": 6538500 + }, + { + "epoch": 2.032743364207365, + "grad_norm": 9.331636428833008, + "learning_rate": 1.6120943929877246e-05, + "loss": 2.7001, + "step": 6539000 + }, + { + "epoch": 2.032898796487852, + "grad_norm": 13.785004615783691, + "learning_rate": 1.611835339186913e-05, + "loss": 2.6759, + "step": 6539500 + }, + { + "epoch": 2.033054228768339, + "grad_norm": 10.606528282165527, + "learning_rate": 1.6115762853861017e-05, + "loss": 2.666, + "step": 6540000 + }, + { + "epoch": 2.0332096610488257, + "grad_norm": 9.927135467529297, + "learning_rate": 1.61131723158529e-05, + "loss": 2.6918, + "step": 6540500 + }, + { + "epoch": 2.033365093329313, + "grad_norm": 8.430176734924316, + "learning_rate": 1.6110581777844788e-05, + "loss": 2.7272, + "step": 6541000 + }, + { + "epoch": 2.0335205256098, + "grad_norm": 8.321571350097656, + "learning_rate": 1.6107991239836672e-05, + "loss": 2.6807, + "step": 6541500 + }, + { + "epoch": 2.0336759578902868, + "grad_norm": 9.53666877746582, + "learning_rate": 1.610540070182856e-05, + "loss": 2.6376, + "step": 6542000 + }, + { + "epoch": 2.0338313901707736, + "grad_norm": 11.349222183227539, + "learning_rate": 1.6102810163820443e-05, + "loss": 2.6952, + "step": 6542500 + }, + { + "epoch": 2.0339868224512605, + "grad_norm": 7.05241060256958, + "learning_rate": 1.610021962581233e-05, + "loss": 2.6894, + "step": 6543000 + }, + { + "epoch": 2.0341422547317474, + "grad_norm": 8.506032943725586, + "learning_rate": 1.6097629087804214e-05, + "loss": 2.6668, + "step": 6543500 + }, + { + "epoch": 2.0342976870122342, + "grad_norm": 8.735908508300781, + "learning_rate": 1.6095038549796097e-05, + "loss": 2.667, + "step": 6544000 + }, + { + "epoch": 2.034453119292721, + "grad_norm": 32.87714767456055, + "learning_rate": 1.6092448011787985e-05, + "loss": 2.6952, + "step": 6544500 + }, + { + "epoch": 2.034608551573208, + "grad_norm": 9.226994514465332, + "learning_rate": 1.6089857473779872e-05, + "loss": 2.6686, + "step": 6545000 + }, + { + "epoch": 2.034763983853695, + "grad_norm": 10.593411445617676, + "learning_rate": 1.6087266935771756e-05, + "loss": 2.6984, + "step": 6545500 + }, + { + "epoch": 2.0349194161341817, + "grad_norm": 7.895347595214844, + "learning_rate": 1.608467639776364e-05, + "loss": 2.6568, + "step": 6546000 + }, + { + "epoch": 2.0350748484146686, + "grad_norm": 17.56942367553711, + "learning_rate": 1.6082085859755526e-05, + "loss": 2.7017, + "step": 6546500 + }, + { + "epoch": 2.0352302806951554, + "grad_norm": 10.369783401489258, + "learning_rate": 1.6079495321747414e-05, + "loss": 2.6981, + "step": 6547000 + }, + { + "epoch": 2.0353857129756423, + "grad_norm": 10.478930473327637, + "learning_rate": 1.6076904783739297e-05, + "loss": 2.6963, + "step": 6547500 + }, + { + "epoch": 2.035541145256129, + "grad_norm": 13.021665573120117, + "learning_rate": 1.6074314245731185e-05, + "loss": 2.6468, + "step": 6548000 + }, + { + "epoch": 2.035696577536616, + "grad_norm": 9.192367553710938, + "learning_rate": 1.607172370772307e-05, + "loss": 2.6837, + "step": 6548500 + }, + { + "epoch": 2.035852009817103, + "grad_norm": 12.029928207397461, + "learning_rate": 1.6069133169714952e-05, + "loss": 2.683, + "step": 6549000 + }, + { + "epoch": 2.0360074420975898, + "grad_norm": 11.386079788208008, + "learning_rate": 1.606654263170684e-05, + "loss": 2.68, + "step": 6549500 + }, + { + "epoch": 2.0361628743780766, + "grad_norm": 9.36013126373291, + "learning_rate": 1.6063952093698726e-05, + "loss": 2.6723, + "step": 6550000 + }, + { + "epoch": 2.0363183066585635, + "grad_norm": 29.017248153686523, + "learning_rate": 1.606136155569061e-05, + "loss": 2.677, + "step": 6550500 + }, + { + "epoch": 2.0364737389390504, + "grad_norm": 9.445663452148438, + "learning_rate": 1.6058771017682494e-05, + "loss": 2.7137, + "step": 6551000 + }, + { + "epoch": 2.0366291712195372, + "grad_norm": 10.953438758850098, + "learning_rate": 1.6056180479674378e-05, + "loss": 2.6894, + "step": 6551500 + }, + { + "epoch": 2.036784603500024, + "grad_norm": 6.992936134338379, + "learning_rate": 1.6053589941666268e-05, + "loss": 2.7202, + "step": 6552000 + }, + { + "epoch": 2.036940035780511, + "grad_norm": 9.291109085083008, + "learning_rate": 1.6050999403658152e-05, + "loss": 2.6855, + "step": 6552500 + }, + { + "epoch": 2.037095468060998, + "grad_norm": 8.200841903686523, + "learning_rate": 1.6048408865650036e-05, + "loss": 2.715, + "step": 6553000 + }, + { + "epoch": 2.0372509003414847, + "grad_norm": 9.346052169799805, + "learning_rate": 1.6045818327641923e-05, + "loss": 2.7249, + "step": 6553500 + }, + { + "epoch": 2.0374063326219716, + "grad_norm": 10.995390892028809, + "learning_rate": 1.6043227789633807e-05, + "loss": 2.6874, + "step": 6554000 + }, + { + "epoch": 2.0375617649024584, + "grad_norm": 9.613551139831543, + "learning_rate": 1.6040637251625694e-05, + "loss": 2.6526, + "step": 6554500 + }, + { + "epoch": 2.0377171971829453, + "grad_norm": 21.474559783935547, + "learning_rate": 1.6038046713617578e-05, + "loss": 2.7084, + "step": 6555000 + }, + { + "epoch": 2.037872629463432, + "grad_norm": 11.047367095947266, + "learning_rate": 1.6035456175609465e-05, + "loss": 2.6826, + "step": 6555500 + }, + { + "epoch": 2.038028061743919, + "grad_norm": 9.48479175567627, + "learning_rate": 1.603286563760135e-05, + "loss": 2.7024, + "step": 6556000 + }, + { + "epoch": 2.038183494024406, + "grad_norm": 12.243215560913086, + "learning_rate": 1.6030275099593232e-05, + "loss": 2.6977, + "step": 6556500 + }, + { + "epoch": 2.0383389263048928, + "grad_norm": 19.143415451049805, + "learning_rate": 1.6027684561585123e-05, + "loss": 2.7327, + "step": 6557000 + }, + { + "epoch": 2.0384943585853796, + "grad_norm": 8.301162719726562, + "learning_rate": 1.6025094023577007e-05, + "loss": 2.6252, + "step": 6557500 + }, + { + "epoch": 2.0386497908658665, + "grad_norm": 8.415359497070312, + "learning_rate": 1.602250348556889e-05, + "loss": 2.7145, + "step": 6558000 + }, + { + "epoch": 2.0388052231463534, + "grad_norm": 10.639252662658691, + "learning_rate": 1.6019912947560774e-05, + "loss": 2.6631, + "step": 6558500 + }, + { + "epoch": 2.0389606554268402, + "grad_norm": 10.020273208618164, + "learning_rate": 1.601732240955266e-05, + "loss": 2.6835, + "step": 6559000 + }, + { + "epoch": 2.039116087707327, + "grad_norm": 11.02117919921875, + "learning_rate": 1.601473187154455e-05, + "loss": 2.6576, + "step": 6559500 + }, + { + "epoch": 2.039271519987814, + "grad_norm": 113.75675964355469, + "learning_rate": 1.6012141333536432e-05, + "loss": 2.6834, + "step": 6560000 + }, + { + "epoch": 2.039426952268301, + "grad_norm": 8.891725540161133, + "learning_rate": 1.6009550795528316e-05, + "loss": 2.6496, + "step": 6560500 + }, + { + "epoch": 2.0395823845487877, + "grad_norm": 10.181907653808594, + "learning_rate": 1.6006960257520203e-05, + "loss": 2.6584, + "step": 6561000 + }, + { + "epoch": 2.0397378168292746, + "grad_norm": 17.721616744995117, + "learning_rate": 1.6004369719512087e-05, + "loss": 2.6378, + "step": 6561500 + }, + { + "epoch": 2.0398932491097614, + "grad_norm": 11.482336044311523, + "learning_rate": 1.6001779181503974e-05, + "loss": 2.7148, + "step": 6562000 + }, + { + "epoch": 2.0400486813902483, + "grad_norm": 15.764663696289062, + "learning_rate": 1.599918864349586e-05, + "loss": 2.7218, + "step": 6562500 + }, + { + "epoch": 2.040204113670735, + "grad_norm": 11.922788619995117, + "learning_rate": 1.5996598105487745e-05, + "loss": 2.6888, + "step": 6563000 + }, + { + "epoch": 2.040359545951222, + "grad_norm": 22.635316848754883, + "learning_rate": 1.599400756747963e-05, + "loss": 2.6874, + "step": 6563500 + }, + { + "epoch": 2.040514978231709, + "grad_norm": 9.791698455810547, + "learning_rate": 1.5991417029471516e-05, + "loss": 2.691, + "step": 6564000 + }, + { + "epoch": 2.0406704105121958, + "grad_norm": 10.226073265075684, + "learning_rate": 1.5988826491463403e-05, + "loss": 2.6649, + "step": 6564500 + }, + { + "epoch": 2.040825842792683, + "grad_norm": 9.711691856384277, + "learning_rate": 1.5986235953455287e-05, + "loss": 2.7174, + "step": 6565000 + }, + { + "epoch": 2.04098127507317, + "grad_norm": 11.925829887390137, + "learning_rate": 1.598364541544717e-05, + "loss": 2.6723, + "step": 6565500 + }, + { + "epoch": 2.041136707353657, + "grad_norm": 9.200526237487793, + "learning_rate": 1.5981054877439058e-05, + "loss": 2.6658, + "step": 6566000 + }, + { + "epoch": 2.0412921396341437, + "grad_norm": 11.923274993896484, + "learning_rate": 1.597846433943094e-05, + "loss": 2.674, + "step": 6566500 + }, + { + "epoch": 2.0414475719146306, + "grad_norm": 14.407397270202637, + "learning_rate": 1.597587380142283e-05, + "loss": 2.6683, + "step": 6567000 + }, + { + "epoch": 2.0416030041951174, + "grad_norm": 14.948527336120605, + "learning_rate": 1.5973283263414712e-05, + "loss": 2.6544, + "step": 6567500 + }, + { + "epoch": 2.0417584364756043, + "grad_norm": 9.630783081054688, + "learning_rate": 1.59706927254066e-05, + "loss": 2.7685, + "step": 6568000 + }, + { + "epoch": 2.041913868756091, + "grad_norm": 8.849905967712402, + "learning_rate": 1.5968102187398483e-05, + "loss": 2.6641, + "step": 6568500 + }, + { + "epoch": 2.042069301036578, + "grad_norm": 9.129586219787598, + "learning_rate": 1.596551164939037e-05, + "loss": 2.6476, + "step": 6569000 + }, + { + "epoch": 2.042224733317065, + "grad_norm": 9.812392234802246, + "learning_rate": 1.5962921111382254e-05, + "loss": 2.7004, + "step": 6569500 + }, + { + "epoch": 2.0423801655975518, + "grad_norm": 21.0211238861084, + "learning_rate": 1.596033057337414e-05, + "loss": 2.6447, + "step": 6570000 + }, + { + "epoch": 2.0425355978780386, + "grad_norm": 7.453287601470947, + "learning_rate": 1.5957740035366025e-05, + "loss": 2.7269, + "step": 6570500 + }, + { + "epoch": 2.0426910301585255, + "grad_norm": 9.640110969543457, + "learning_rate": 1.595514949735791e-05, + "loss": 2.6737, + "step": 6571000 + }, + { + "epoch": 2.0428464624390124, + "grad_norm": 13.18388557434082, + "learning_rate": 1.5952558959349796e-05, + "loss": 2.7347, + "step": 6571500 + }, + { + "epoch": 2.0430018947194992, + "grad_norm": 27.08013153076172, + "learning_rate": 1.5949968421341683e-05, + "loss": 2.6495, + "step": 6572000 + }, + { + "epoch": 2.043157326999986, + "grad_norm": 9.364423751831055, + "learning_rate": 1.5947377883333567e-05, + "loss": 2.6597, + "step": 6572500 + }, + { + "epoch": 2.043312759280473, + "grad_norm": 9.508058547973633, + "learning_rate": 1.594478734532545e-05, + "loss": 2.6941, + "step": 6573000 + }, + { + "epoch": 2.04346819156096, + "grad_norm": 30.818681716918945, + "learning_rate": 1.5942196807317338e-05, + "loss": 2.7222, + "step": 6573500 + }, + { + "epoch": 2.0436236238414467, + "grad_norm": 16.997400283813477, + "learning_rate": 1.5939606269309225e-05, + "loss": 2.6717, + "step": 6574000 + }, + { + "epoch": 2.0437790561219336, + "grad_norm": 9.488465309143066, + "learning_rate": 1.593701573130111e-05, + "loss": 2.7562, + "step": 6574500 + }, + { + "epoch": 2.0439344884024204, + "grad_norm": 7.809385776519775, + "learning_rate": 1.5934425193292996e-05, + "loss": 2.6903, + "step": 6575000 + }, + { + "epoch": 2.0440899206829073, + "grad_norm": 8.929349899291992, + "learning_rate": 1.593183465528488e-05, + "loss": 2.6727, + "step": 6575500 + }, + { + "epoch": 2.044245352963394, + "grad_norm": 11.648028373718262, + "learning_rate": 1.5929244117276764e-05, + "loss": 2.6957, + "step": 6576000 + }, + { + "epoch": 2.044400785243881, + "grad_norm": 8.880167961120605, + "learning_rate": 1.592665357926865e-05, + "loss": 2.6327, + "step": 6576500 + }, + { + "epoch": 2.044556217524368, + "grad_norm": 10.980363845825195, + "learning_rate": 1.5924063041260538e-05, + "loss": 2.6997, + "step": 6577000 + }, + { + "epoch": 2.0447116498048548, + "grad_norm": 10.958874702453613, + "learning_rate": 1.592147250325242e-05, + "loss": 2.7641, + "step": 6577500 + }, + { + "epoch": 2.0448670820853416, + "grad_norm": 13.971793174743652, + "learning_rate": 1.5918881965244305e-05, + "loss": 2.724, + "step": 6578000 + }, + { + "epoch": 2.0450225143658285, + "grad_norm": 12.799453735351562, + "learning_rate": 1.591629142723619e-05, + "loss": 2.6523, + "step": 6578500 + }, + { + "epoch": 2.0451779466463154, + "grad_norm": 9.793581008911133, + "learning_rate": 1.591370088922808e-05, + "loss": 2.7148, + "step": 6579000 + }, + { + "epoch": 2.0453333789268022, + "grad_norm": 9.775115013122559, + "learning_rate": 1.5911110351219963e-05, + "loss": 2.7352, + "step": 6579500 + }, + { + "epoch": 2.045488811207289, + "grad_norm": 38.40021514892578, + "learning_rate": 1.5908519813211847e-05, + "loss": 2.6926, + "step": 6580000 + }, + { + "epoch": 2.045644243487776, + "grad_norm": 12.642677307128906, + "learning_rate": 1.5905929275203734e-05, + "loss": 2.6677, + "step": 6580500 + }, + { + "epoch": 2.045799675768263, + "grad_norm": 11.285929679870605, + "learning_rate": 1.5903338737195618e-05, + "loss": 2.6906, + "step": 6581000 + }, + { + "epoch": 2.0459551080487497, + "grad_norm": 6.8810811042785645, + "learning_rate": 1.5900748199187505e-05, + "loss": 2.6386, + "step": 6581500 + }, + { + "epoch": 2.0461105403292366, + "grad_norm": 19.69621467590332, + "learning_rate": 1.589815766117939e-05, + "loss": 2.6604, + "step": 6582000 + }, + { + "epoch": 2.0462659726097234, + "grad_norm": 9.069329261779785, + "learning_rate": 1.5895567123171276e-05, + "loss": 2.7228, + "step": 6582500 + }, + { + "epoch": 2.0464214048902103, + "grad_norm": 5.248758792877197, + "learning_rate": 1.589297658516316e-05, + "loss": 2.6698, + "step": 6583000 + }, + { + "epoch": 2.046576837170697, + "grad_norm": 9.865951538085938, + "learning_rate": 1.5890386047155044e-05, + "loss": 2.6959, + "step": 6583500 + }, + { + "epoch": 2.046732269451184, + "grad_norm": 10.345258712768555, + "learning_rate": 1.5887795509146934e-05, + "loss": 2.6661, + "step": 6584000 + }, + { + "epoch": 2.046887701731671, + "grad_norm": 11.589953422546387, + "learning_rate": 1.5885204971138818e-05, + "loss": 2.7523, + "step": 6584500 + }, + { + "epoch": 2.0470431340121578, + "grad_norm": 12.072528839111328, + "learning_rate": 1.5882614433130702e-05, + "loss": 2.709, + "step": 6585000 + }, + { + "epoch": 2.0471985662926446, + "grad_norm": 11.064383506774902, + "learning_rate": 1.5880023895122586e-05, + "loss": 2.6942, + "step": 6585500 + }, + { + "epoch": 2.0473539985731315, + "grad_norm": 9.99407958984375, + "learning_rate": 1.5877433357114473e-05, + "loss": 2.6867, + "step": 6586000 + }, + { + "epoch": 2.0475094308536184, + "grad_norm": 9.810612678527832, + "learning_rate": 1.587484281910636e-05, + "loss": 2.7035, + "step": 6586500 + }, + { + "epoch": 2.0476648631341052, + "grad_norm": 8.36544418334961, + "learning_rate": 1.5872252281098244e-05, + "loss": 2.6593, + "step": 6587000 + }, + { + "epoch": 2.047820295414592, + "grad_norm": 10.495609283447266, + "learning_rate": 1.5869661743090127e-05, + "loss": 2.6821, + "step": 6587500 + }, + { + "epoch": 2.047975727695079, + "grad_norm": 12.585498809814453, + "learning_rate": 1.5867071205082015e-05, + "loss": 2.6949, + "step": 6588000 + }, + { + "epoch": 2.048131159975566, + "grad_norm": 9.732745170593262, + "learning_rate": 1.5864480667073898e-05, + "loss": 2.6872, + "step": 6588500 + }, + { + "epoch": 2.048286592256053, + "grad_norm": 13.961509704589844, + "learning_rate": 1.5861890129065785e-05, + "loss": 2.6698, + "step": 6589000 + }, + { + "epoch": 2.04844202453654, + "grad_norm": 10.1156005859375, + "learning_rate": 1.5859299591057673e-05, + "loss": 2.6889, + "step": 6589500 + }, + { + "epoch": 2.048597456817027, + "grad_norm": 9.00040054321289, + "learning_rate": 1.5856709053049556e-05, + "loss": 2.6878, + "step": 6590000 + }, + { + "epoch": 2.0487528890975137, + "grad_norm": 10.296149253845215, + "learning_rate": 1.585411851504144e-05, + "loss": 2.6987, + "step": 6590500 + }, + { + "epoch": 2.0489083213780006, + "grad_norm": 11.685768127441406, + "learning_rate": 1.5851527977033327e-05, + "loss": 2.6657, + "step": 6591000 + }, + { + "epoch": 2.0490637536584875, + "grad_norm": 13.859115600585938, + "learning_rate": 1.5848937439025214e-05, + "loss": 2.6534, + "step": 6591500 + }, + { + "epoch": 2.0492191859389743, + "grad_norm": 9.628662109375, + "learning_rate": 1.5846346901017098e-05, + "loss": 2.6382, + "step": 6592000 + }, + { + "epoch": 2.049374618219461, + "grad_norm": 28.43555450439453, + "learning_rate": 1.5843756363008982e-05, + "loss": 2.7155, + "step": 6592500 + }, + { + "epoch": 2.049530050499948, + "grad_norm": 14.447835922241211, + "learning_rate": 1.584116582500087e-05, + "loss": 2.7041, + "step": 6593000 + }, + { + "epoch": 2.049685482780435, + "grad_norm": 11.183114051818848, + "learning_rate": 1.5838575286992753e-05, + "loss": 2.7129, + "step": 6593500 + }, + { + "epoch": 2.049840915060922, + "grad_norm": 9.649235725402832, + "learning_rate": 1.583598474898464e-05, + "loss": 2.6953, + "step": 6594000 + }, + { + "epoch": 2.0499963473414087, + "grad_norm": 9.066604614257812, + "learning_rate": 1.5833394210976524e-05, + "loss": 2.6895, + "step": 6594500 + }, + { + "epoch": 2.0501517796218955, + "grad_norm": 39.10525131225586, + "learning_rate": 1.583080367296841e-05, + "loss": 2.6797, + "step": 6595000 + }, + { + "epoch": 2.0503072119023824, + "grad_norm": 42.96348571777344, + "learning_rate": 1.5828213134960295e-05, + "loss": 2.7003, + "step": 6595500 + }, + { + "epoch": 2.0504626441828693, + "grad_norm": 13.54821491241455, + "learning_rate": 1.5825622596952182e-05, + "loss": 2.6735, + "step": 6596000 + }, + { + "epoch": 2.050618076463356, + "grad_norm": 10.939285278320312, + "learning_rate": 1.5823032058944066e-05, + "loss": 2.6749, + "step": 6596500 + }, + { + "epoch": 2.050773508743843, + "grad_norm": 32.988765716552734, + "learning_rate": 1.5820441520935953e-05, + "loss": 2.7179, + "step": 6597000 + }, + { + "epoch": 2.05092894102433, + "grad_norm": 9.141331672668457, + "learning_rate": 1.5817850982927837e-05, + "loss": 2.7154, + "step": 6597500 + }, + { + "epoch": 2.0510843733048167, + "grad_norm": 8.782336235046387, + "learning_rate": 1.581526044491972e-05, + "loss": 2.7012, + "step": 6598000 + }, + { + "epoch": 2.0512398055853036, + "grad_norm": 8.556257247924805, + "learning_rate": 1.5812669906911607e-05, + "loss": 2.6586, + "step": 6598500 + }, + { + "epoch": 2.0513952378657905, + "grad_norm": 9.952863693237305, + "learning_rate": 1.5810079368903495e-05, + "loss": 2.6875, + "step": 6599000 + }, + { + "epoch": 2.0515506701462773, + "grad_norm": 12.721390724182129, + "learning_rate": 1.580748883089538e-05, + "loss": 2.6922, + "step": 6599500 + }, + { + "epoch": 2.051706102426764, + "grad_norm": 6.397401809692383, + "learning_rate": 1.5804898292887262e-05, + "loss": 2.6806, + "step": 6600000 + }, + { + "epoch": 2.051861534707251, + "grad_norm": 9.178874969482422, + "learning_rate": 1.580230775487915e-05, + "loss": 2.6773, + "step": 6600500 + }, + { + "epoch": 2.052016966987738, + "grad_norm": 9.475863456726074, + "learning_rate": 1.5799717216871036e-05, + "loss": 2.6673, + "step": 6601000 + }, + { + "epoch": 2.052172399268225, + "grad_norm": 7.7190022468566895, + "learning_rate": 1.579712667886292e-05, + "loss": 2.694, + "step": 6601500 + }, + { + "epoch": 2.0523278315487117, + "grad_norm": 11.483798027038574, + "learning_rate": 1.5794536140854807e-05, + "loss": 2.6754, + "step": 6602000 + }, + { + "epoch": 2.0524832638291985, + "grad_norm": 9.898622512817383, + "learning_rate": 1.579194560284669e-05, + "loss": 2.6825, + "step": 6602500 + }, + { + "epoch": 2.0526386961096854, + "grad_norm": 8.234662055969238, + "learning_rate": 1.5789355064838575e-05, + "loss": 2.6612, + "step": 6603000 + }, + { + "epoch": 2.0527941283901723, + "grad_norm": 10.813386917114258, + "learning_rate": 1.5786764526830462e-05, + "loss": 2.7, + "step": 6603500 + }, + { + "epoch": 2.052949560670659, + "grad_norm": 8.603727340698242, + "learning_rate": 1.578417398882235e-05, + "loss": 2.6815, + "step": 6604000 + }, + { + "epoch": 2.053104992951146, + "grad_norm": 10.935528755187988, + "learning_rate": 1.5781583450814233e-05, + "loss": 2.6704, + "step": 6604500 + }, + { + "epoch": 2.053260425231633, + "grad_norm": 13.520212173461914, + "learning_rate": 1.5778992912806117e-05, + "loss": 2.7109, + "step": 6605000 + }, + { + "epoch": 2.0534158575121197, + "grad_norm": 11.71785831451416, + "learning_rate": 1.5776402374798e-05, + "loss": 2.7325, + "step": 6605500 + }, + { + "epoch": 2.0535712897926066, + "grad_norm": 8.507654190063477, + "learning_rate": 1.577381183678989e-05, + "loss": 2.7277, + "step": 6606000 + }, + { + "epoch": 2.0537267220730935, + "grad_norm": 9.606528282165527, + "learning_rate": 1.5771221298781775e-05, + "loss": 2.6842, + "step": 6606500 + }, + { + "epoch": 2.0538821543535803, + "grad_norm": 19.80496597290039, + "learning_rate": 1.576863076077366e-05, + "loss": 2.6802, + "step": 6607000 + }, + { + "epoch": 2.054037586634067, + "grad_norm": 29.2476749420166, + "learning_rate": 1.5766040222765546e-05, + "loss": 2.6843, + "step": 6607500 + }, + { + "epoch": 2.054193018914554, + "grad_norm": 11.24028205871582, + "learning_rate": 1.576344968475743e-05, + "loss": 2.707, + "step": 6608000 + }, + { + "epoch": 2.054348451195041, + "grad_norm": 78.66790008544922, + "learning_rate": 1.5760859146749317e-05, + "loss": 2.7045, + "step": 6608500 + }, + { + "epoch": 2.054503883475528, + "grad_norm": 9.642828941345215, + "learning_rate": 1.57582686087412e-05, + "loss": 2.7076, + "step": 6609000 + }, + { + "epoch": 2.0546593157560147, + "grad_norm": 6.38845682144165, + "learning_rate": 1.5755678070733088e-05, + "loss": 2.6861, + "step": 6609500 + }, + { + "epoch": 2.0548147480365015, + "grad_norm": 8.32078742980957, + "learning_rate": 1.575308753272497e-05, + "loss": 2.6946, + "step": 6610000 + }, + { + "epoch": 2.0549701803169884, + "grad_norm": 8.869956970214844, + "learning_rate": 1.5750496994716855e-05, + "loss": 2.7054, + "step": 6610500 + }, + { + "epoch": 2.0551256125974753, + "grad_norm": 16.466785430908203, + "learning_rate": 1.5747906456708746e-05, + "loss": 2.6535, + "step": 6611000 + }, + { + "epoch": 2.055281044877962, + "grad_norm": 10.134562492370605, + "learning_rate": 1.574531591870063e-05, + "loss": 2.6871, + "step": 6611500 + }, + { + "epoch": 2.055436477158449, + "grad_norm": 12.541667938232422, + "learning_rate": 1.5742725380692513e-05, + "loss": 2.6754, + "step": 6612000 + }, + { + "epoch": 2.055591909438936, + "grad_norm": 9.011490821838379, + "learning_rate": 1.5740134842684397e-05, + "loss": 2.6924, + "step": 6612500 + }, + { + "epoch": 2.0557473417194227, + "grad_norm": 9.657326698303223, + "learning_rate": 1.5737544304676284e-05, + "loss": 2.6456, + "step": 6613000 + }, + { + "epoch": 2.05590277399991, + "grad_norm": 9.03519344329834, + "learning_rate": 1.573495376666817e-05, + "loss": 2.6933, + "step": 6613500 + }, + { + "epoch": 2.056058206280397, + "grad_norm": 8.693602561950684, + "learning_rate": 1.5732363228660055e-05, + "loss": 2.6553, + "step": 6614000 + }, + { + "epoch": 2.056213638560884, + "grad_norm": 8.621400833129883, + "learning_rate": 1.572977269065194e-05, + "loss": 2.6996, + "step": 6614500 + }, + { + "epoch": 2.0563690708413707, + "grad_norm": 12.017509460449219, + "learning_rate": 1.5727182152643826e-05, + "loss": 2.6469, + "step": 6615000 + }, + { + "epoch": 2.0565245031218575, + "grad_norm": 7.873671531677246, + "learning_rate": 1.572459161463571e-05, + "loss": 2.6972, + "step": 6615500 + }, + { + "epoch": 2.0566799354023444, + "grad_norm": 11.258150100708008, + "learning_rate": 1.5722001076627597e-05, + "loss": 2.6809, + "step": 6616000 + }, + { + "epoch": 2.0568353676828313, + "grad_norm": 13.301918983459473, + "learning_rate": 1.5719410538619484e-05, + "loss": 2.6585, + "step": 6616500 + }, + { + "epoch": 2.056990799963318, + "grad_norm": 7.379308700561523, + "learning_rate": 1.5716820000611368e-05, + "loss": 2.6513, + "step": 6617000 + }, + { + "epoch": 2.057146232243805, + "grad_norm": 8.53642463684082, + "learning_rate": 1.571422946260325e-05, + "loss": 2.674, + "step": 6617500 + }, + { + "epoch": 2.057301664524292, + "grad_norm": 8.889886856079102, + "learning_rate": 1.571163892459514e-05, + "loss": 2.7436, + "step": 6618000 + }, + { + "epoch": 2.0574570968047787, + "grad_norm": 15.929206848144531, + "learning_rate": 1.5709048386587026e-05, + "loss": 2.7457, + "step": 6618500 + }, + { + "epoch": 2.0576125290852656, + "grad_norm": 9.460229873657227, + "learning_rate": 1.570645784857891e-05, + "loss": 2.6766, + "step": 6619000 + }, + { + "epoch": 2.0577679613657525, + "grad_norm": 12.482217788696289, + "learning_rate": 1.5703867310570793e-05, + "loss": 2.6541, + "step": 6619500 + }, + { + "epoch": 2.0579233936462393, + "grad_norm": 10.421370506286621, + "learning_rate": 1.570127677256268e-05, + "loss": 2.6475, + "step": 6620000 + }, + { + "epoch": 2.058078825926726, + "grad_norm": 9.457893371582031, + "learning_rate": 1.5698686234554568e-05, + "loss": 2.6556, + "step": 6620500 + }, + { + "epoch": 2.058234258207213, + "grad_norm": 10.02214241027832, + "learning_rate": 1.569609569654645e-05, + "loss": 2.6794, + "step": 6621000 + }, + { + "epoch": 2.0583896904877, + "grad_norm": 8.7160005569458, + "learning_rate": 1.5693505158538335e-05, + "loss": 2.6385, + "step": 6621500 + }, + { + "epoch": 2.058545122768187, + "grad_norm": 9.744338035583496, + "learning_rate": 1.5690914620530222e-05, + "loss": 2.6744, + "step": 6622000 + }, + { + "epoch": 2.0587005550486737, + "grad_norm": 9.92478084564209, + "learning_rate": 1.5688324082522106e-05, + "loss": 2.6321, + "step": 6622500 + }, + { + "epoch": 2.0588559873291605, + "grad_norm": 8.90877628326416, + "learning_rate": 1.5685733544513993e-05, + "loss": 2.6685, + "step": 6623000 + }, + { + "epoch": 2.0590114196096474, + "grad_norm": 9.918718338012695, + "learning_rate": 1.5683143006505877e-05, + "loss": 2.6911, + "step": 6623500 + }, + { + "epoch": 2.0591668518901343, + "grad_norm": 12.028644561767578, + "learning_rate": 1.5680552468497764e-05, + "loss": 2.6624, + "step": 6624000 + }, + { + "epoch": 2.059322284170621, + "grad_norm": 9.889080047607422, + "learning_rate": 1.5677961930489648e-05, + "loss": 2.6469, + "step": 6624500 + }, + { + "epoch": 2.059477716451108, + "grad_norm": 9.131577491760254, + "learning_rate": 1.5675371392481532e-05, + "loss": 2.7099, + "step": 6625000 + }, + { + "epoch": 2.059633148731595, + "grad_norm": 10.743148803710938, + "learning_rate": 1.5672780854473422e-05, + "loss": 2.6857, + "step": 6625500 + }, + { + "epoch": 2.0597885810120817, + "grad_norm": 8.890665054321289, + "learning_rate": 1.5670190316465306e-05, + "loss": 2.7007, + "step": 6626000 + }, + { + "epoch": 2.0599440132925686, + "grad_norm": 9.685550689697266, + "learning_rate": 1.566759977845719e-05, + "loss": 2.6934, + "step": 6626500 + }, + { + "epoch": 2.0600994455730555, + "grad_norm": 11.233015060424805, + "learning_rate": 1.5665009240449074e-05, + "loss": 2.6804, + "step": 6627000 + }, + { + "epoch": 2.0602548778535423, + "grad_norm": 10.453324317932129, + "learning_rate": 1.566241870244096e-05, + "loss": 2.6741, + "step": 6627500 + }, + { + "epoch": 2.060410310134029, + "grad_norm": 8.787861824035645, + "learning_rate": 1.5659828164432848e-05, + "loss": 2.7106, + "step": 6628000 + }, + { + "epoch": 2.060565742414516, + "grad_norm": 8.800017356872559, + "learning_rate": 1.565723762642473e-05, + "loss": 2.6821, + "step": 6628500 + }, + { + "epoch": 2.060721174695003, + "grad_norm": 6.277029037475586, + "learning_rate": 1.565464708841662e-05, + "loss": 2.6998, + "step": 6629000 + }, + { + "epoch": 2.06087660697549, + "grad_norm": 8.517751693725586, + "learning_rate": 1.5652056550408503e-05, + "loss": 2.6925, + "step": 6629500 + }, + { + "epoch": 2.0610320392559767, + "grad_norm": 19.26609992980957, + "learning_rate": 1.5649466012400386e-05, + "loss": 2.6911, + "step": 6630000 + }, + { + "epoch": 2.0611874715364635, + "grad_norm": 21.904457092285156, + "learning_rate": 1.5646875474392273e-05, + "loss": 2.6761, + "step": 6630500 + }, + { + "epoch": 2.0613429038169504, + "grad_norm": 8.692089080810547, + "learning_rate": 1.564428493638416e-05, + "loss": 2.6996, + "step": 6631000 + }, + { + "epoch": 2.0614983360974373, + "grad_norm": 8.391697883605957, + "learning_rate": 1.5641694398376044e-05, + "loss": 2.6541, + "step": 6631500 + }, + { + "epoch": 2.061653768377924, + "grad_norm": 9.516386032104492, + "learning_rate": 1.5639103860367928e-05, + "loss": 2.6686, + "step": 6632000 + }, + { + "epoch": 2.061809200658411, + "grad_norm": 15.773314476013184, + "learning_rate": 1.5636513322359815e-05, + "loss": 2.6887, + "step": 6632500 + }, + { + "epoch": 2.061964632938898, + "grad_norm": 8.915635108947754, + "learning_rate": 1.5633922784351702e-05, + "loss": 2.6783, + "step": 6633000 + }, + { + "epoch": 2.0621200652193847, + "grad_norm": 11.828878402709961, + "learning_rate": 1.5631332246343586e-05, + "loss": 2.7376, + "step": 6633500 + }, + { + "epoch": 2.0622754974998716, + "grad_norm": 11.349981307983398, + "learning_rate": 1.562874170833547e-05, + "loss": 2.7141, + "step": 6634000 + }, + { + "epoch": 2.0624309297803585, + "grad_norm": 10.10114860534668, + "learning_rate": 1.5626151170327357e-05, + "loss": 2.6712, + "step": 6634500 + }, + { + "epoch": 2.0625863620608453, + "grad_norm": 9.388832092285156, + "learning_rate": 1.562356063231924e-05, + "loss": 2.7128, + "step": 6635000 + }, + { + "epoch": 2.062741794341332, + "grad_norm": 11.228326797485352, + "learning_rate": 1.5620970094311128e-05, + "loss": 2.6828, + "step": 6635500 + }, + { + "epoch": 2.062897226621819, + "grad_norm": 9.710440635681152, + "learning_rate": 1.5618379556303012e-05, + "loss": 2.7134, + "step": 6636000 + }, + { + "epoch": 2.063052658902306, + "grad_norm": 11.889323234558105, + "learning_rate": 1.56157890182949e-05, + "loss": 2.6587, + "step": 6636500 + }, + { + "epoch": 2.0632080911827932, + "grad_norm": 9.67951488494873, + "learning_rate": 1.5613198480286783e-05, + "loss": 2.6451, + "step": 6637000 + }, + { + "epoch": 2.06336352346328, + "grad_norm": 8.794925689697266, + "learning_rate": 1.561060794227867e-05, + "loss": 2.6694, + "step": 6637500 + }, + { + "epoch": 2.063518955743767, + "grad_norm": 8.755643844604492, + "learning_rate": 1.5608017404270557e-05, + "loss": 2.7079, + "step": 6638000 + }, + { + "epoch": 2.063674388024254, + "grad_norm": 9.5531644821167, + "learning_rate": 1.560542686626244e-05, + "loss": 2.6963, + "step": 6638500 + }, + { + "epoch": 2.0638298203047407, + "grad_norm": 9.97157096862793, + "learning_rate": 1.5602836328254325e-05, + "loss": 2.6711, + "step": 6639000 + }, + { + "epoch": 2.0639852525852276, + "grad_norm": 10.027703285217285, + "learning_rate": 1.560024579024621e-05, + "loss": 2.7004, + "step": 6639500 + }, + { + "epoch": 2.0641406848657144, + "grad_norm": 8.6510591506958, + "learning_rate": 1.5597655252238096e-05, + "loss": 2.6798, + "step": 6640000 + }, + { + "epoch": 2.0642961171462013, + "grad_norm": 12.67145824432373, + "learning_rate": 1.5595064714229983e-05, + "loss": 2.6671, + "step": 6640500 + }, + { + "epoch": 2.064451549426688, + "grad_norm": 19.509584426879883, + "learning_rate": 1.5592474176221866e-05, + "loss": 2.6733, + "step": 6641000 + }, + { + "epoch": 2.064606981707175, + "grad_norm": 9.22652816772461, + "learning_rate": 1.5589883638213754e-05, + "loss": 2.6836, + "step": 6641500 + }, + { + "epoch": 2.064762413987662, + "grad_norm": 9.960939407348633, + "learning_rate": 1.5587293100205637e-05, + "loss": 2.6546, + "step": 6642000 + }, + { + "epoch": 2.064917846268149, + "grad_norm": 10.639436721801758, + "learning_rate": 1.5584702562197525e-05, + "loss": 2.661, + "step": 6642500 + }, + { + "epoch": 2.0650732785486356, + "grad_norm": 7.741174221038818, + "learning_rate": 1.5582112024189408e-05, + "loss": 2.6871, + "step": 6643000 + }, + { + "epoch": 2.0652287108291225, + "grad_norm": 7.613457202911377, + "learning_rate": 1.5579521486181295e-05, + "loss": 2.7263, + "step": 6643500 + }, + { + "epoch": 2.0653841431096094, + "grad_norm": 10.309518814086914, + "learning_rate": 1.557693094817318e-05, + "loss": 2.6859, + "step": 6644000 + }, + { + "epoch": 2.0655395753900962, + "grad_norm": 10.195111274719238, + "learning_rate": 1.5574340410165063e-05, + "loss": 2.7001, + "step": 6644500 + }, + { + "epoch": 2.065695007670583, + "grad_norm": 8.428764343261719, + "learning_rate": 1.557174987215695e-05, + "loss": 2.6718, + "step": 6645000 + }, + { + "epoch": 2.06585043995107, + "grad_norm": 6.3646135330200195, + "learning_rate": 1.5569159334148837e-05, + "loss": 2.7351, + "step": 6645500 + }, + { + "epoch": 2.066005872231557, + "grad_norm": 9.34688949584961, + "learning_rate": 1.556656879614072e-05, + "loss": 2.6865, + "step": 6646000 + }, + { + "epoch": 2.0661613045120437, + "grad_norm": 9.884950637817383, + "learning_rate": 1.5563978258132605e-05, + "loss": 2.6121, + "step": 6646500 + }, + { + "epoch": 2.0663167367925306, + "grad_norm": 9.785995483398438, + "learning_rate": 1.5561387720124492e-05, + "loss": 2.7227, + "step": 6647000 + }, + { + "epoch": 2.0664721690730175, + "grad_norm": 12.12309741973877, + "learning_rate": 1.555879718211638e-05, + "loss": 2.7196, + "step": 6647500 + }, + { + "epoch": 2.0666276013535043, + "grad_norm": 30.198455810546875, + "learning_rate": 1.5556206644108263e-05, + "loss": 2.6879, + "step": 6648000 + }, + { + "epoch": 2.066783033633991, + "grad_norm": 18.05254554748535, + "learning_rate": 1.5553616106100147e-05, + "loss": 2.6515, + "step": 6648500 + }, + { + "epoch": 2.066938465914478, + "grad_norm": 11.884662628173828, + "learning_rate": 1.5551025568092034e-05, + "loss": 2.6491, + "step": 6649000 + }, + { + "epoch": 2.067093898194965, + "grad_norm": 12.451506614685059, + "learning_rate": 1.5548435030083918e-05, + "loss": 2.6979, + "step": 6649500 + }, + { + "epoch": 2.067249330475452, + "grad_norm": 7.614224433898926, + "learning_rate": 1.5545844492075805e-05, + "loss": 2.6746, + "step": 6650000 + }, + { + "epoch": 2.0674047627559387, + "grad_norm": 9.929543495178223, + "learning_rate": 1.5543253954067692e-05, + "loss": 2.7137, + "step": 6650500 + }, + { + "epoch": 2.0675601950364255, + "grad_norm": 11.763327598571777, + "learning_rate": 1.5540663416059576e-05, + "loss": 2.6517, + "step": 6651000 + }, + { + "epoch": 2.0677156273169124, + "grad_norm": 9.738743782043457, + "learning_rate": 1.553807287805146e-05, + "loss": 2.6722, + "step": 6651500 + }, + { + "epoch": 2.0678710595973993, + "grad_norm": 9.21399211883545, + "learning_rate": 1.5535482340043343e-05, + "loss": 2.6764, + "step": 6652000 + }, + { + "epoch": 2.068026491877886, + "grad_norm": 10.831578254699707, + "learning_rate": 1.5532891802035234e-05, + "loss": 2.7175, + "step": 6652500 + }, + { + "epoch": 2.068181924158373, + "grad_norm": 8.461090087890625, + "learning_rate": 1.5530301264027117e-05, + "loss": 2.6764, + "step": 6653000 + }, + { + "epoch": 2.06833735643886, + "grad_norm": 11.311217308044434, + "learning_rate": 1.5527710726019e-05, + "loss": 2.7115, + "step": 6653500 + }, + { + "epoch": 2.0684927887193467, + "grad_norm": 9.600846290588379, + "learning_rate": 1.5525120188010885e-05, + "loss": 2.6684, + "step": 6654000 + }, + { + "epoch": 2.0686482209998336, + "grad_norm": 9.578808784484863, + "learning_rate": 1.5522529650002772e-05, + "loss": 2.7096, + "step": 6654500 + }, + { + "epoch": 2.0688036532803205, + "grad_norm": 10.168962478637695, + "learning_rate": 1.551993911199466e-05, + "loss": 2.6542, + "step": 6655000 + }, + { + "epoch": 2.0689590855608073, + "grad_norm": 17.72127342224121, + "learning_rate": 1.5517348573986543e-05, + "loss": 2.7357, + "step": 6655500 + }, + { + "epoch": 2.069114517841294, + "grad_norm": 9.248409271240234, + "learning_rate": 1.551475803597843e-05, + "loss": 2.6497, + "step": 6656000 + }, + { + "epoch": 2.069269950121781, + "grad_norm": 9.005361557006836, + "learning_rate": 1.5512167497970314e-05, + "loss": 2.6642, + "step": 6656500 + }, + { + "epoch": 2.069425382402268, + "grad_norm": 36.39203643798828, + "learning_rate": 1.5509576959962198e-05, + "loss": 2.6839, + "step": 6657000 + }, + { + "epoch": 2.069580814682755, + "grad_norm": 38.25336456298828, + "learning_rate": 1.5506986421954085e-05, + "loss": 2.7051, + "step": 6657500 + }, + { + "epoch": 2.0697362469632417, + "grad_norm": 22.887374877929688, + "learning_rate": 1.5504395883945972e-05, + "loss": 2.6718, + "step": 6658000 + }, + { + "epoch": 2.0698916792437285, + "grad_norm": 10.928617477416992, + "learning_rate": 1.5501805345937856e-05, + "loss": 2.6845, + "step": 6658500 + }, + { + "epoch": 2.0700471115242154, + "grad_norm": 9.581314086914062, + "learning_rate": 1.549921480792974e-05, + "loss": 2.6685, + "step": 6659000 + }, + { + "epoch": 2.0702025438047023, + "grad_norm": 8.817415237426758, + "learning_rate": 1.5496624269921627e-05, + "loss": 2.6854, + "step": 6659500 + }, + { + "epoch": 2.070357976085189, + "grad_norm": 18.146459579467773, + "learning_rate": 1.5494033731913514e-05, + "loss": 2.6314, + "step": 6660000 + }, + { + "epoch": 2.070513408365676, + "grad_norm": 8.677345275878906, + "learning_rate": 1.5491443193905398e-05, + "loss": 2.6389, + "step": 6660500 + }, + { + "epoch": 2.070668840646163, + "grad_norm": 9.805794715881348, + "learning_rate": 1.548885265589728e-05, + "loss": 2.6702, + "step": 6661000 + }, + { + "epoch": 2.07082427292665, + "grad_norm": 10.031074523925781, + "learning_rate": 1.548626211788917e-05, + "loss": 2.69, + "step": 6661500 + }, + { + "epoch": 2.070979705207137, + "grad_norm": 8.687263488769531, + "learning_rate": 1.5483671579881052e-05, + "loss": 2.6676, + "step": 6662000 + }, + { + "epoch": 2.071135137487624, + "grad_norm": 9.001849174499512, + "learning_rate": 1.548108104187294e-05, + "loss": 2.6597, + "step": 6662500 + }, + { + "epoch": 2.0712905697681108, + "grad_norm": 8.772978782653809, + "learning_rate": 1.5478490503864823e-05, + "loss": 2.681, + "step": 6663000 + }, + { + "epoch": 2.0714460020485976, + "grad_norm": 10.087295532226562, + "learning_rate": 1.547589996585671e-05, + "loss": 2.6592, + "step": 6663500 + }, + { + "epoch": 2.0716014343290845, + "grad_norm": 9.455381393432617, + "learning_rate": 1.5473309427848594e-05, + "loss": 2.6587, + "step": 6664000 + }, + { + "epoch": 2.0717568666095714, + "grad_norm": 9.780304908752441, + "learning_rate": 1.547071888984048e-05, + "loss": 2.6592, + "step": 6664500 + }, + { + "epoch": 2.0719122988900582, + "grad_norm": 12.395410537719727, + "learning_rate": 1.546812835183237e-05, + "loss": 2.6688, + "step": 6665000 + }, + { + "epoch": 2.072067731170545, + "grad_norm": 10.605031967163086, + "learning_rate": 1.5465537813824252e-05, + "loss": 2.6887, + "step": 6665500 + }, + { + "epoch": 2.072223163451032, + "grad_norm": 8.836790084838867, + "learning_rate": 1.5462947275816136e-05, + "loss": 2.7593, + "step": 6666000 + }, + { + "epoch": 2.072378595731519, + "grad_norm": 7.911892890930176, + "learning_rate": 1.546035673780802e-05, + "loss": 2.7319, + "step": 6666500 + }, + { + "epoch": 2.0725340280120057, + "grad_norm": 26.44670867919922, + "learning_rate": 1.5457766199799907e-05, + "loss": 2.7507, + "step": 6667000 + }, + { + "epoch": 2.0726894602924926, + "grad_norm": 7.449582576751709, + "learning_rate": 1.5455175661791794e-05, + "loss": 2.698, + "step": 6667500 + }, + { + "epoch": 2.0728448925729794, + "grad_norm": 10.016100883483887, + "learning_rate": 1.5452585123783678e-05, + "loss": 2.7423, + "step": 6668000 + }, + { + "epoch": 2.0730003248534663, + "grad_norm": 14.788844108581543, + "learning_rate": 1.5449994585775565e-05, + "loss": 2.6628, + "step": 6668500 + }, + { + "epoch": 2.073155757133953, + "grad_norm": 10.077733993530273, + "learning_rate": 1.544740404776745e-05, + "loss": 2.6916, + "step": 6669000 + }, + { + "epoch": 2.07331118941444, + "grad_norm": 11.683755874633789, + "learning_rate": 1.5444813509759336e-05, + "loss": 2.6226, + "step": 6669500 + }, + { + "epoch": 2.073466621694927, + "grad_norm": 8.714752197265625, + "learning_rate": 1.544222297175122e-05, + "loss": 2.658, + "step": 6670000 + }, + { + "epoch": 2.0736220539754138, + "grad_norm": 11.544998168945312, + "learning_rate": 1.5439632433743107e-05, + "loss": 2.6391, + "step": 6670500 + }, + { + "epoch": 2.0737774862559006, + "grad_norm": 8.286722183227539, + "learning_rate": 1.543704189573499e-05, + "loss": 2.7118, + "step": 6671000 + }, + { + "epoch": 2.0739329185363875, + "grad_norm": 9.227900505065918, + "learning_rate": 1.5434451357726874e-05, + "loss": 2.7029, + "step": 6671500 + }, + { + "epoch": 2.0740883508168744, + "grad_norm": 10.815176010131836, + "learning_rate": 1.543186081971876e-05, + "loss": 2.6689, + "step": 6672000 + }, + { + "epoch": 2.0742437830973612, + "grad_norm": 11.520392417907715, + "learning_rate": 1.542927028171065e-05, + "loss": 2.6556, + "step": 6672500 + }, + { + "epoch": 2.074399215377848, + "grad_norm": 7.875696659088135, + "learning_rate": 1.5426679743702532e-05, + "loss": 2.6818, + "step": 6673000 + }, + { + "epoch": 2.074554647658335, + "grad_norm": 37.32448196411133, + "learning_rate": 1.5424089205694416e-05, + "loss": 2.6489, + "step": 6673500 + }, + { + "epoch": 2.074710079938822, + "grad_norm": 9.286386489868164, + "learning_rate": 1.5421498667686303e-05, + "loss": 2.6844, + "step": 6674000 + }, + { + "epoch": 2.0748655122193087, + "grad_norm": 10.174944877624512, + "learning_rate": 1.541890812967819e-05, + "loss": 2.6534, + "step": 6674500 + }, + { + "epoch": 2.0750209444997956, + "grad_norm": 11.02733325958252, + "learning_rate": 1.5416317591670074e-05, + "loss": 2.6623, + "step": 6675000 + }, + { + "epoch": 2.0751763767802824, + "grad_norm": 10.093646049499512, + "learning_rate": 1.5413727053661958e-05, + "loss": 2.6948, + "step": 6675500 + }, + { + "epoch": 2.0753318090607693, + "grad_norm": 5.293300628662109, + "learning_rate": 1.5411136515653845e-05, + "loss": 2.6908, + "step": 6676000 + }, + { + "epoch": 2.075487241341256, + "grad_norm": 10.371970176696777, + "learning_rate": 1.540854597764573e-05, + "loss": 2.6655, + "step": 6676500 + }, + { + "epoch": 2.075642673621743, + "grad_norm": 11.185344696044922, + "learning_rate": 1.5405955439637616e-05, + "loss": 2.6559, + "step": 6677000 + }, + { + "epoch": 2.07579810590223, + "grad_norm": 9.789923667907715, + "learning_rate": 1.5403364901629503e-05, + "loss": 2.6874, + "step": 6677500 + }, + { + "epoch": 2.0759535381827168, + "grad_norm": 9.240958213806152, + "learning_rate": 1.5400774363621387e-05, + "loss": 2.6835, + "step": 6678000 + }, + { + "epoch": 2.0761089704632036, + "grad_norm": 8.699371337890625, + "learning_rate": 1.539818382561327e-05, + "loss": 2.6435, + "step": 6678500 + }, + { + "epoch": 2.0762644027436905, + "grad_norm": 13.295795440673828, + "learning_rate": 1.5395593287605155e-05, + "loss": 2.682, + "step": 6679000 + }, + { + "epoch": 2.0764198350241774, + "grad_norm": 12.754226684570312, + "learning_rate": 1.5393002749597045e-05, + "loss": 2.7186, + "step": 6679500 + }, + { + "epoch": 2.0765752673046642, + "grad_norm": 9.103994369506836, + "learning_rate": 1.539041221158893e-05, + "loss": 2.6546, + "step": 6680000 + }, + { + "epoch": 2.076730699585151, + "grad_norm": 9.096620559692383, + "learning_rate": 1.5387821673580813e-05, + "loss": 2.6774, + "step": 6680500 + }, + { + "epoch": 2.076886131865638, + "grad_norm": 9.970149040222168, + "learning_rate": 1.5385231135572696e-05, + "loss": 2.7277, + "step": 6681000 + }, + { + "epoch": 2.077041564146125, + "grad_norm": 9.844231605529785, + "learning_rate": 1.5382640597564584e-05, + "loss": 2.664, + "step": 6681500 + }, + { + "epoch": 2.0771969964266117, + "grad_norm": 8.72933578491211, + "learning_rate": 1.538005005955647e-05, + "loss": 2.6626, + "step": 6682000 + }, + { + "epoch": 2.0773524287070986, + "grad_norm": 9.853575706481934, + "learning_rate": 1.5377459521548354e-05, + "loss": 2.6897, + "step": 6682500 + }, + { + "epoch": 2.0775078609875854, + "grad_norm": 10.057190895080566, + "learning_rate": 1.537486898354024e-05, + "loss": 2.7117, + "step": 6683000 + }, + { + "epoch": 2.0776632932680723, + "grad_norm": 10.70521354675293, + "learning_rate": 1.5372278445532125e-05, + "loss": 2.6999, + "step": 6683500 + }, + { + "epoch": 2.077818725548559, + "grad_norm": 8.927663803100586, + "learning_rate": 1.536968790752401e-05, + "loss": 2.6328, + "step": 6684000 + }, + { + "epoch": 2.077974157829046, + "grad_norm": 11.43118953704834, + "learning_rate": 1.5367097369515896e-05, + "loss": 2.7093, + "step": 6684500 + }, + { + "epoch": 2.0781295901095334, + "grad_norm": 11.22292423248291, + "learning_rate": 1.5364506831507783e-05, + "loss": 2.696, + "step": 6685000 + }, + { + "epoch": 2.0782850223900198, + "grad_norm": 16.415687561035156, + "learning_rate": 1.5361916293499667e-05, + "loss": 2.6915, + "step": 6685500 + }, + { + "epoch": 2.078440454670507, + "grad_norm": 23.842628479003906, + "learning_rate": 1.535932575549155e-05, + "loss": 2.7352, + "step": 6686000 + }, + { + "epoch": 2.078595886950994, + "grad_norm": 8.687894821166992, + "learning_rate": 1.5356735217483438e-05, + "loss": 2.6593, + "step": 6686500 + }, + { + "epoch": 2.078751319231481, + "grad_norm": 8.847914695739746, + "learning_rate": 1.5354144679475325e-05, + "loss": 2.6549, + "step": 6687000 + }, + { + "epoch": 2.0789067515119677, + "grad_norm": 11.53581428527832, + "learning_rate": 1.535155414146721e-05, + "loss": 2.6737, + "step": 6687500 + }, + { + "epoch": 2.0790621837924546, + "grad_norm": 10.741677284240723, + "learning_rate": 1.5348963603459093e-05, + "loss": 2.7124, + "step": 6688000 + }, + { + "epoch": 2.0792176160729414, + "grad_norm": 39.98200607299805, + "learning_rate": 1.534637306545098e-05, + "loss": 2.691, + "step": 6688500 + }, + { + "epoch": 2.0793730483534283, + "grad_norm": 9.456578254699707, + "learning_rate": 1.5343782527442864e-05, + "loss": 2.6659, + "step": 6689000 + }, + { + "epoch": 2.079528480633915, + "grad_norm": 9.608175277709961, + "learning_rate": 1.534119198943475e-05, + "loss": 2.6809, + "step": 6689500 + }, + { + "epoch": 2.079683912914402, + "grad_norm": 11.421225547790527, + "learning_rate": 1.5338601451426635e-05, + "loss": 2.7143, + "step": 6690000 + }, + { + "epoch": 2.079839345194889, + "grad_norm": 10.252606391906738, + "learning_rate": 1.5336010913418522e-05, + "loss": 2.6667, + "step": 6690500 + }, + { + "epoch": 2.0799947774753758, + "grad_norm": 9.372246742248535, + "learning_rate": 1.5333420375410406e-05, + "loss": 2.6937, + "step": 6691000 + }, + { + "epoch": 2.0801502097558626, + "grad_norm": 7.959596633911133, + "learning_rate": 1.5330829837402293e-05, + "loss": 2.7, + "step": 6691500 + }, + { + "epoch": 2.0803056420363495, + "grad_norm": 8.63485050201416, + "learning_rate": 1.532823929939418e-05, + "loss": 2.696, + "step": 6692000 + }, + { + "epoch": 2.0804610743168364, + "grad_norm": 9.269721031188965, + "learning_rate": 1.5325648761386064e-05, + "loss": 2.6681, + "step": 6692500 + }, + { + "epoch": 2.0806165065973232, + "grad_norm": 10.276688575744629, + "learning_rate": 1.5323058223377947e-05, + "loss": 2.6587, + "step": 6693000 + }, + { + "epoch": 2.08077193887781, + "grad_norm": 11.27310562133789, + "learning_rate": 1.532046768536983e-05, + "loss": 2.6761, + "step": 6693500 + }, + { + "epoch": 2.080927371158297, + "grad_norm": 9.38005542755127, + "learning_rate": 1.531787714736172e-05, + "loss": 2.6725, + "step": 6694000 + }, + { + "epoch": 2.081082803438784, + "grad_norm": 15.609776496887207, + "learning_rate": 1.5315286609353606e-05, + "loss": 2.6624, + "step": 6694500 + }, + { + "epoch": 2.0812382357192707, + "grad_norm": 26.405479431152344, + "learning_rate": 1.531269607134549e-05, + "loss": 2.6656, + "step": 6695000 + }, + { + "epoch": 2.0813936679997576, + "grad_norm": 9.686834335327148, + "learning_rate": 1.5310105533337376e-05, + "loss": 2.7111, + "step": 6695500 + }, + { + "epoch": 2.0815491002802444, + "grad_norm": 8.997300148010254, + "learning_rate": 1.530751499532926e-05, + "loss": 2.6638, + "step": 6696000 + }, + { + "epoch": 2.0817045325607313, + "grad_norm": 17.896526336669922, + "learning_rate": 1.5304924457321147e-05, + "loss": 2.6416, + "step": 6696500 + }, + { + "epoch": 2.081859964841218, + "grad_norm": 11.05445384979248, + "learning_rate": 1.530233391931303e-05, + "loss": 2.6668, + "step": 6697000 + }, + { + "epoch": 2.082015397121705, + "grad_norm": 9.866427421569824, + "learning_rate": 1.5299743381304918e-05, + "loss": 2.7078, + "step": 6697500 + }, + { + "epoch": 2.082170829402192, + "grad_norm": 9.20074462890625, + "learning_rate": 1.5297152843296802e-05, + "loss": 2.6868, + "step": 6698000 + }, + { + "epoch": 2.0823262616826788, + "grad_norm": 9.141411781311035, + "learning_rate": 1.5294562305288686e-05, + "loss": 2.7244, + "step": 6698500 + }, + { + "epoch": 2.0824816939631656, + "grad_norm": 8.485459327697754, + "learning_rate": 1.5291971767280573e-05, + "loss": 2.6628, + "step": 6699000 + }, + { + "epoch": 2.0826371262436525, + "grad_norm": 11.533233642578125, + "learning_rate": 1.528938122927246e-05, + "loss": 2.6769, + "step": 6699500 + }, + { + "epoch": 2.0827925585241394, + "grad_norm": 10.637045860290527, + "learning_rate": 1.5286790691264344e-05, + "loss": 2.6515, + "step": 6700000 + }, + { + "epoch": 2.0829479908046262, + "grad_norm": 9.887364387512207, + "learning_rate": 1.5284200153256228e-05, + "loss": 2.6968, + "step": 6700500 + }, + { + "epoch": 2.083103423085113, + "grad_norm": 10.424408912658691, + "learning_rate": 1.5281609615248115e-05, + "loss": 2.7083, + "step": 6701000 + }, + { + "epoch": 2.0832588553656, + "grad_norm": 9.883237838745117, + "learning_rate": 1.5279019077240002e-05, + "loss": 2.7044, + "step": 6701500 + }, + { + "epoch": 2.083414287646087, + "grad_norm": 11.457036018371582, + "learning_rate": 1.5276428539231886e-05, + "loss": 2.6884, + "step": 6702000 + }, + { + "epoch": 2.0835697199265737, + "grad_norm": 10.489386558532715, + "learning_rate": 1.527383800122377e-05, + "loss": 2.6406, + "step": 6702500 + }, + { + "epoch": 2.0837251522070606, + "grad_norm": 21.280075073242188, + "learning_rate": 1.5271247463215657e-05, + "loss": 2.6624, + "step": 6703000 + }, + { + "epoch": 2.0838805844875474, + "grad_norm": 114.2562026977539, + "learning_rate": 1.526865692520754e-05, + "loss": 2.6906, + "step": 6703500 + }, + { + "epoch": 2.0840360167680343, + "grad_norm": 34.50334548950195, + "learning_rate": 1.5266066387199428e-05, + "loss": 2.7393, + "step": 6704000 + }, + { + "epoch": 2.084191449048521, + "grad_norm": 10.662958145141602, + "learning_rate": 1.5263475849191315e-05, + "loss": 2.7128, + "step": 6704500 + }, + { + "epoch": 2.084346881329008, + "grad_norm": 9.369436264038086, + "learning_rate": 1.52608853111832e-05, + "loss": 2.6864, + "step": 6705000 + }, + { + "epoch": 2.084502313609495, + "grad_norm": 16.157978057861328, + "learning_rate": 1.5258294773175082e-05, + "loss": 2.6761, + "step": 6705500 + }, + { + "epoch": 2.0846577458899818, + "grad_norm": 9.293479919433594, + "learning_rate": 1.5255704235166968e-05, + "loss": 2.6599, + "step": 6706000 + }, + { + "epoch": 2.0848131781704686, + "grad_norm": 9.957204818725586, + "learning_rate": 1.5253113697158855e-05, + "loss": 2.6957, + "step": 6706500 + }, + { + "epoch": 2.0849686104509555, + "grad_norm": 10.562093734741211, + "learning_rate": 1.525052315915074e-05, + "loss": 2.6885, + "step": 6707000 + }, + { + "epoch": 2.0851240427314424, + "grad_norm": 9.916892051696777, + "learning_rate": 1.5247932621142624e-05, + "loss": 2.6848, + "step": 6707500 + }, + { + "epoch": 2.0852794750119292, + "grad_norm": 19.559961318969727, + "learning_rate": 1.524534208313451e-05, + "loss": 2.6569, + "step": 6708000 + }, + { + "epoch": 2.085434907292416, + "grad_norm": 10.772778511047363, + "learning_rate": 1.5242751545126397e-05, + "loss": 2.6989, + "step": 6708500 + }, + { + "epoch": 2.085590339572903, + "grad_norm": 9.718379020690918, + "learning_rate": 1.524016100711828e-05, + "loss": 2.7195, + "step": 6709000 + }, + { + "epoch": 2.0857457718533903, + "grad_norm": 9.163044929504395, + "learning_rate": 1.5237570469110166e-05, + "loss": 2.6899, + "step": 6709500 + }, + { + "epoch": 2.085901204133877, + "grad_norm": 11.348213195800781, + "learning_rate": 1.5234979931102053e-05, + "loss": 2.6434, + "step": 6710000 + }, + { + "epoch": 2.086056636414364, + "grad_norm": 30.90701675415039, + "learning_rate": 1.5232389393093937e-05, + "loss": 2.6998, + "step": 6710500 + }, + { + "epoch": 2.086212068694851, + "grad_norm": 8.528834342956543, + "learning_rate": 1.5229798855085822e-05, + "loss": 2.6863, + "step": 6711000 + }, + { + "epoch": 2.0863675009753377, + "grad_norm": 6.768044948577881, + "learning_rate": 1.5227208317077708e-05, + "loss": 2.597, + "step": 6711500 + }, + { + "epoch": 2.0865229332558246, + "grad_norm": 9.506616592407227, + "learning_rate": 1.5224617779069595e-05, + "loss": 2.673, + "step": 6712000 + }, + { + "epoch": 2.0866783655363115, + "grad_norm": 8.672966957092285, + "learning_rate": 1.5222027241061479e-05, + "loss": 2.6502, + "step": 6712500 + }, + { + "epoch": 2.0868337978167983, + "grad_norm": 7.1213860511779785, + "learning_rate": 1.5219436703053364e-05, + "loss": 2.6875, + "step": 6713000 + }, + { + "epoch": 2.086989230097285, + "grad_norm": 9.139581680297852, + "learning_rate": 1.5216846165045251e-05, + "loss": 2.6968, + "step": 6713500 + }, + { + "epoch": 2.087144662377772, + "grad_norm": 8.708588600158691, + "learning_rate": 1.5214255627037135e-05, + "loss": 2.6931, + "step": 6714000 + }, + { + "epoch": 2.087300094658259, + "grad_norm": 8.99721622467041, + "learning_rate": 1.521166508902902e-05, + "loss": 2.6637, + "step": 6714500 + }, + { + "epoch": 2.087455526938746, + "grad_norm": 9.27236557006836, + "learning_rate": 1.5209074551020904e-05, + "loss": 2.6811, + "step": 6715000 + }, + { + "epoch": 2.0876109592192327, + "grad_norm": 12.827768325805664, + "learning_rate": 1.5206484013012791e-05, + "loss": 2.6781, + "step": 6715500 + }, + { + "epoch": 2.0877663914997195, + "grad_norm": 8.669366836547852, + "learning_rate": 1.5203893475004677e-05, + "loss": 2.6469, + "step": 6716000 + }, + { + "epoch": 2.0879218237802064, + "grad_norm": 8.806619644165039, + "learning_rate": 1.5201302936996562e-05, + "loss": 2.6545, + "step": 6716500 + }, + { + "epoch": 2.0880772560606933, + "grad_norm": 51.091575622558594, + "learning_rate": 1.5198712398988446e-05, + "loss": 2.7205, + "step": 6717000 + }, + { + "epoch": 2.08823268834118, + "grad_norm": 8.72520923614502, + "learning_rate": 1.5196121860980333e-05, + "loss": 2.6627, + "step": 6717500 + }, + { + "epoch": 2.088388120621667, + "grad_norm": 9.642910957336426, + "learning_rate": 1.5193531322972219e-05, + "loss": 2.6841, + "step": 6718000 + }, + { + "epoch": 2.088543552902154, + "grad_norm": 9.890599250793457, + "learning_rate": 1.5190940784964102e-05, + "loss": 2.6295, + "step": 6718500 + }, + { + "epoch": 2.0886989851826407, + "grad_norm": 10.30260944366455, + "learning_rate": 1.518835024695599e-05, + "loss": 2.6583, + "step": 6719000 + }, + { + "epoch": 2.0888544174631276, + "grad_norm": 12.211572647094727, + "learning_rate": 1.5185759708947875e-05, + "loss": 2.6576, + "step": 6719500 + }, + { + "epoch": 2.0890098497436145, + "grad_norm": 16.386707305908203, + "learning_rate": 1.5183169170939759e-05, + "loss": 2.6861, + "step": 6720000 + }, + { + "epoch": 2.0891652820241013, + "grad_norm": 8.68195629119873, + "learning_rate": 1.5180578632931644e-05, + "loss": 2.6885, + "step": 6720500 + }, + { + "epoch": 2.089320714304588, + "grad_norm": 9.850728034973145, + "learning_rate": 1.5177988094923531e-05, + "loss": 2.6777, + "step": 6721000 + }, + { + "epoch": 2.089476146585075, + "grad_norm": 7.297066688537598, + "learning_rate": 1.5175397556915417e-05, + "loss": 2.721, + "step": 6721500 + }, + { + "epoch": 2.089631578865562, + "grad_norm": 9.759649276733398, + "learning_rate": 1.51728070189073e-05, + "loss": 2.6676, + "step": 6722000 + }, + { + "epoch": 2.089787011146049, + "grad_norm": 8.437259674072266, + "learning_rate": 1.5170216480899188e-05, + "loss": 2.704, + "step": 6722500 + }, + { + "epoch": 2.0899424434265357, + "grad_norm": 9.37203311920166, + "learning_rate": 1.5167625942891073e-05, + "loss": 2.6873, + "step": 6723000 + }, + { + "epoch": 2.0900978757070225, + "grad_norm": 9.304635047912598, + "learning_rate": 1.5165035404882957e-05, + "loss": 2.6318, + "step": 6723500 + }, + { + "epoch": 2.0902533079875094, + "grad_norm": 10.344898223876953, + "learning_rate": 1.5162444866874843e-05, + "loss": 2.718, + "step": 6724000 + }, + { + "epoch": 2.0904087402679963, + "grad_norm": 22.987537384033203, + "learning_rate": 1.515985432886673e-05, + "loss": 2.6808, + "step": 6724500 + }, + { + "epoch": 2.090564172548483, + "grad_norm": 9.290575981140137, + "learning_rate": 1.5157263790858613e-05, + "loss": 2.7194, + "step": 6725000 + }, + { + "epoch": 2.09071960482897, + "grad_norm": 10.669219970703125, + "learning_rate": 1.5154673252850499e-05, + "loss": 2.7155, + "step": 6725500 + }, + { + "epoch": 2.090875037109457, + "grad_norm": 6.337404251098633, + "learning_rate": 1.5152082714842383e-05, + "loss": 2.673, + "step": 6726000 + }, + { + "epoch": 2.0910304693899437, + "grad_norm": 13.194384574890137, + "learning_rate": 1.5149492176834272e-05, + "loss": 2.6596, + "step": 6726500 + }, + { + "epoch": 2.0911859016704306, + "grad_norm": 11.029769897460938, + "learning_rate": 1.5146901638826155e-05, + "loss": 2.6582, + "step": 6727000 + }, + { + "epoch": 2.0913413339509175, + "grad_norm": 31.673545837402344, + "learning_rate": 1.514431110081804e-05, + "loss": 2.6776, + "step": 6727500 + }, + { + "epoch": 2.0914967662314043, + "grad_norm": 9.874479293823242, + "learning_rate": 1.5141720562809928e-05, + "loss": 2.644, + "step": 6728000 + }, + { + "epoch": 2.091652198511891, + "grad_norm": 9.478946685791016, + "learning_rate": 1.5139130024801812e-05, + "loss": 2.6738, + "step": 6728500 + }, + { + "epoch": 2.091807630792378, + "grad_norm": 25.18596649169922, + "learning_rate": 1.5136539486793697e-05, + "loss": 2.6652, + "step": 6729000 + }, + { + "epoch": 2.091963063072865, + "grad_norm": 9.020874977111816, + "learning_rate": 1.5133948948785581e-05, + "loss": 2.7135, + "step": 6729500 + }, + { + "epoch": 2.092118495353352, + "grad_norm": 8.699000358581543, + "learning_rate": 1.5131358410777468e-05, + "loss": 2.7451, + "step": 6730000 + }, + { + "epoch": 2.0922739276338387, + "grad_norm": 9.270832061767578, + "learning_rate": 1.5128767872769353e-05, + "loss": 2.6706, + "step": 6730500 + }, + { + "epoch": 2.0924293599143255, + "grad_norm": 8.328203201293945, + "learning_rate": 1.5126177334761237e-05, + "loss": 2.6492, + "step": 6731000 + }, + { + "epoch": 2.0925847921948124, + "grad_norm": 17.146059036254883, + "learning_rate": 1.5123586796753126e-05, + "loss": 2.6207, + "step": 6731500 + }, + { + "epoch": 2.0927402244752993, + "grad_norm": 10.462145805358887, + "learning_rate": 1.512099625874501e-05, + "loss": 2.6788, + "step": 6732000 + }, + { + "epoch": 2.092895656755786, + "grad_norm": 13.4699125289917, + "learning_rate": 1.5118405720736895e-05, + "loss": 2.6367, + "step": 6732500 + }, + { + "epoch": 2.0930510890362735, + "grad_norm": 14.054478645324707, + "learning_rate": 1.5115815182728779e-05, + "loss": 2.7204, + "step": 6733000 + }, + { + "epoch": 2.09320652131676, + "grad_norm": 12.539753913879395, + "learning_rate": 1.5113224644720666e-05, + "loss": 2.7045, + "step": 6733500 + }, + { + "epoch": 2.093361953597247, + "grad_norm": 11.979164123535156, + "learning_rate": 1.5110634106712552e-05, + "loss": 2.632, + "step": 6734000 + }, + { + "epoch": 2.093517385877734, + "grad_norm": 10.460380554199219, + "learning_rate": 1.5108043568704435e-05, + "loss": 2.6949, + "step": 6734500 + }, + { + "epoch": 2.093672818158221, + "grad_norm": 25.839351654052734, + "learning_rate": 1.5105453030696321e-05, + "loss": 2.6939, + "step": 6735000 + }, + { + "epoch": 2.093828250438708, + "grad_norm": 14.909442901611328, + "learning_rate": 1.5102862492688208e-05, + "loss": 2.7111, + "step": 6735500 + }, + { + "epoch": 2.0939836827191947, + "grad_norm": 12.946710586547852, + "learning_rate": 1.5100271954680092e-05, + "loss": 2.7126, + "step": 6736000 + }, + { + "epoch": 2.0941391149996815, + "grad_norm": 10.151771545410156, + "learning_rate": 1.5097681416671977e-05, + "loss": 2.6317, + "step": 6736500 + }, + { + "epoch": 2.0942945472801684, + "grad_norm": 8.471571922302246, + "learning_rate": 1.5095090878663864e-05, + "loss": 2.6593, + "step": 6737000 + }, + { + "epoch": 2.0944499795606553, + "grad_norm": 8.608681678771973, + "learning_rate": 1.509250034065575e-05, + "loss": 2.6655, + "step": 6737500 + }, + { + "epoch": 2.094605411841142, + "grad_norm": 10.031418800354004, + "learning_rate": 1.5089909802647634e-05, + "loss": 2.694, + "step": 6738000 + }, + { + "epoch": 2.094760844121629, + "grad_norm": 35.83427047729492, + "learning_rate": 1.5087319264639519e-05, + "loss": 2.6685, + "step": 6738500 + }, + { + "epoch": 2.094916276402116, + "grad_norm": 26.244508743286133, + "learning_rate": 1.5084728726631406e-05, + "loss": 2.6845, + "step": 6739000 + }, + { + "epoch": 2.0950717086826027, + "grad_norm": 10.899681091308594, + "learning_rate": 1.508213818862329e-05, + "loss": 2.6449, + "step": 6739500 + }, + { + "epoch": 2.0952271409630896, + "grad_norm": 9.683813095092773, + "learning_rate": 1.5079547650615176e-05, + "loss": 2.6757, + "step": 6740000 + }, + { + "epoch": 2.0953825732435765, + "grad_norm": 12.463815689086914, + "learning_rate": 1.5076957112607063e-05, + "loss": 2.65, + "step": 6740500 + }, + { + "epoch": 2.0955380055240633, + "grad_norm": 8.890645027160645, + "learning_rate": 1.5074366574598946e-05, + "loss": 2.6731, + "step": 6741000 + }, + { + "epoch": 2.09569343780455, + "grad_norm": 9.108452796936035, + "learning_rate": 1.5071776036590832e-05, + "loss": 2.6639, + "step": 6741500 + }, + { + "epoch": 2.095848870085037, + "grad_norm": 9.783312797546387, + "learning_rate": 1.5069185498582716e-05, + "loss": 2.688, + "step": 6742000 + }, + { + "epoch": 2.096004302365524, + "grad_norm": 9.133179664611816, + "learning_rate": 1.5066594960574605e-05, + "loss": 2.7024, + "step": 6742500 + }, + { + "epoch": 2.096159734646011, + "grad_norm": 10.839134216308594, + "learning_rate": 1.5064004422566488e-05, + "loss": 2.7056, + "step": 6743000 + }, + { + "epoch": 2.0963151669264977, + "grad_norm": 11.034036636352539, + "learning_rate": 1.5061413884558374e-05, + "loss": 2.6706, + "step": 6743500 + }, + { + "epoch": 2.0964705992069845, + "grad_norm": 9.149762153625488, + "learning_rate": 1.5058823346550258e-05, + "loss": 2.6584, + "step": 6744000 + }, + { + "epoch": 2.0966260314874714, + "grad_norm": 8.359683990478516, + "learning_rate": 1.5056232808542145e-05, + "loss": 2.6385, + "step": 6744500 + }, + { + "epoch": 2.0967814637679583, + "grad_norm": 10.051519393920898, + "learning_rate": 1.505364227053403e-05, + "loss": 2.6928, + "step": 6745000 + }, + { + "epoch": 2.096936896048445, + "grad_norm": 6.7253642082214355, + "learning_rate": 1.5051051732525914e-05, + "loss": 2.6712, + "step": 6745500 + }, + { + "epoch": 2.097092328328932, + "grad_norm": 10.15164852142334, + "learning_rate": 1.5048461194517801e-05, + "loss": 2.7171, + "step": 6746000 + }, + { + "epoch": 2.097247760609419, + "grad_norm": 12.993850708007812, + "learning_rate": 1.5045870656509686e-05, + "loss": 2.7012, + "step": 6746500 + }, + { + "epoch": 2.0974031928899057, + "grad_norm": 10.579489707946777, + "learning_rate": 1.504328011850157e-05, + "loss": 2.7024, + "step": 6747000 + }, + { + "epoch": 2.0975586251703926, + "grad_norm": 8.159491539001465, + "learning_rate": 1.5040689580493456e-05, + "loss": 2.6501, + "step": 6747500 + }, + { + "epoch": 2.0977140574508795, + "grad_norm": 8.916122436523438, + "learning_rate": 1.5038099042485343e-05, + "loss": 2.6209, + "step": 6748000 + }, + { + "epoch": 2.0978694897313663, + "grad_norm": 9.698123931884766, + "learning_rate": 1.5035508504477228e-05, + "loss": 2.674, + "step": 6748500 + }, + { + "epoch": 2.098024922011853, + "grad_norm": 12.242055892944336, + "learning_rate": 1.5032917966469112e-05, + "loss": 2.6644, + "step": 6749000 + }, + { + "epoch": 2.09818035429234, + "grad_norm": 9.73514175415039, + "learning_rate": 1.5030327428461e-05, + "loss": 2.6762, + "step": 6749500 + }, + { + "epoch": 2.098335786572827, + "grad_norm": 8.521147727966309, + "learning_rate": 1.5027736890452885e-05, + "loss": 2.6507, + "step": 6750000 + }, + { + "epoch": 2.098491218853314, + "grad_norm": 8.60849380493164, + "learning_rate": 1.5025146352444768e-05, + "loss": 2.7145, + "step": 6750500 + }, + { + "epoch": 2.0986466511338007, + "grad_norm": 5.964495658874512, + "learning_rate": 1.5022555814436654e-05, + "loss": 2.6286, + "step": 6751000 + }, + { + "epoch": 2.0988020834142875, + "grad_norm": 9.763976097106934, + "learning_rate": 1.5019965276428541e-05, + "loss": 2.6228, + "step": 6751500 + }, + { + "epoch": 2.0989575156947744, + "grad_norm": 15.378582000732422, + "learning_rate": 1.5017374738420425e-05, + "loss": 2.7008, + "step": 6752000 + }, + { + "epoch": 2.0991129479752613, + "grad_norm": 9.983139038085938, + "learning_rate": 1.501478420041231e-05, + "loss": 2.6471, + "step": 6752500 + }, + { + "epoch": 2.099268380255748, + "grad_norm": 11.846339225769043, + "learning_rate": 1.5012193662404194e-05, + "loss": 2.7097, + "step": 6753000 + }, + { + "epoch": 2.099423812536235, + "grad_norm": 10.661768913269043, + "learning_rate": 1.5009603124396083e-05, + "loss": 2.6753, + "step": 6753500 + }, + { + "epoch": 2.099579244816722, + "grad_norm": 12.322308540344238, + "learning_rate": 1.5007012586387967e-05, + "loss": 2.6487, + "step": 6754000 + }, + { + "epoch": 2.0997346770972087, + "grad_norm": 17.09022331237793, + "learning_rate": 1.5004422048379852e-05, + "loss": 2.6612, + "step": 6754500 + }, + { + "epoch": 2.0998901093776956, + "grad_norm": 8.908984184265137, + "learning_rate": 1.500183151037174e-05, + "loss": 2.7155, + "step": 6755000 + }, + { + "epoch": 2.1000455416581825, + "grad_norm": 10.452458381652832, + "learning_rate": 1.4999240972363623e-05, + "loss": 2.6316, + "step": 6755500 + }, + { + "epoch": 2.1002009739386693, + "grad_norm": 11.838159561157227, + "learning_rate": 1.4996650434355509e-05, + "loss": 2.6439, + "step": 6756000 + }, + { + "epoch": 2.100356406219156, + "grad_norm": 12.50634765625, + "learning_rate": 1.4994059896347392e-05, + "loss": 2.65, + "step": 6756500 + }, + { + "epoch": 2.100511838499643, + "grad_norm": 10.430395126342773, + "learning_rate": 1.499146935833928e-05, + "loss": 2.6499, + "step": 6757000 + }, + { + "epoch": 2.1006672707801304, + "grad_norm": 28.90789222717285, + "learning_rate": 1.4988878820331165e-05, + "loss": 2.7084, + "step": 6757500 + }, + { + "epoch": 2.1008227030606172, + "grad_norm": 9.398778915405273, + "learning_rate": 1.4986288282323049e-05, + "loss": 2.6954, + "step": 6758000 + }, + { + "epoch": 2.100978135341104, + "grad_norm": 17.292083740234375, + "learning_rate": 1.4983697744314938e-05, + "loss": 2.6459, + "step": 6758500 + }, + { + "epoch": 2.101133567621591, + "grad_norm": 10.84003734588623, + "learning_rate": 1.4981107206306821e-05, + "loss": 2.6713, + "step": 6759000 + }, + { + "epoch": 2.101288999902078, + "grad_norm": 9.947105407714844, + "learning_rate": 1.4978516668298707e-05, + "loss": 2.6805, + "step": 6759500 + }, + { + "epoch": 2.1014444321825647, + "grad_norm": 10.112781524658203, + "learning_rate": 1.497592613029059e-05, + "loss": 2.6378, + "step": 6760000 + }, + { + "epoch": 2.1015998644630516, + "grad_norm": 13.036643028259277, + "learning_rate": 1.4973335592282478e-05, + "loss": 2.7337, + "step": 6760500 + }, + { + "epoch": 2.1017552967435384, + "grad_norm": 10.554023742675781, + "learning_rate": 1.4970745054274363e-05, + "loss": 2.6583, + "step": 6761000 + }, + { + "epoch": 2.1019107290240253, + "grad_norm": 12.244157791137695, + "learning_rate": 1.4968154516266247e-05, + "loss": 2.6833, + "step": 6761500 + }, + { + "epoch": 2.102066161304512, + "grad_norm": 8.776948928833008, + "learning_rate": 1.4965563978258132e-05, + "loss": 2.6669, + "step": 6762000 + }, + { + "epoch": 2.102221593584999, + "grad_norm": 7.224420547485352, + "learning_rate": 1.496297344025002e-05, + "loss": 2.6442, + "step": 6762500 + }, + { + "epoch": 2.102377025865486, + "grad_norm": 13.284021377563477, + "learning_rate": 1.4960382902241903e-05, + "loss": 2.6682, + "step": 6763000 + }, + { + "epoch": 2.102532458145973, + "grad_norm": 8.146631240844727, + "learning_rate": 1.4957792364233789e-05, + "loss": 2.6821, + "step": 6763500 + }, + { + "epoch": 2.1026878904264596, + "grad_norm": 9.155027389526367, + "learning_rate": 1.4955201826225676e-05, + "loss": 2.7104, + "step": 6764000 + }, + { + "epoch": 2.1028433227069465, + "grad_norm": 13.94201946258545, + "learning_rate": 1.4952611288217561e-05, + "loss": 2.6201, + "step": 6764500 + }, + { + "epoch": 2.1029987549874334, + "grad_norm": 11.779641151428223, + "learning_rate": 1.4950020750209445e-05, + "loss": 2.6907, + "step": 6765000 + }, + { + "epoch": 2.1031541872679202, + "grad_norm": 8.565489768981934, + "learning_rate": 1.494743021220133e-05, + "loss": 2.7059, + "step": 6765500 + }, + { + "epoch": 2.103309619548407, + "grad_norm": 9.572832107543945, + "learning_rate": 1.4944839674193218e-05, + "loss": 2.6659, + "step": 6766000 + }, + { + "epoch": 2.103465051828894, + "grad_norm": 8.414390563964844, + "learning_rate": 1.4942249136185101e-05, + "loss": 2.708, + "step": 6766500 + }, + { + "epoch": 2.103620484109381, + "grad_norm": 9.007536888122559, + "learning_rate": 1.4939658598176987e-05, + "loss": 2.6958, + "step": 6767000 + }, + { + "epoch": 2.1037759163898677, + "grad_norm": 9.946087837219238, + "learning_rate": 1.4937068060168874e-05, + "loss": 2.6749, + "step": 6767500 + }, + { + "epoch": 2.1039313486703546, + "grad_norm": 9.133193016052246, + "learning_rate": 1.4934477522160758e-05, + "loss": 2.6999, + "step": 6768000 + }, + { + "epoch": 2.1040867809508415, + "grad_norm": 22.553030014038086, + "learning_rate": 1.4931886984152643e-05, + "loss": 2.6514, + "step": 6768500 + }, + { + "epoch": 2.1042422132313283, + "grad_norm": 8.5250825881958, + "learning_rate": 1.4929296446144527e-05, + "loss": 2.6177, + "step": 6769000 + }, + { + "epoch": 2.104397645511815, + "grad_norm": 10.054621696472168, + "learning_rate": 1.4926705908136416e-05, + "loss": 2.6686, + "step": 6769500 + }, + { + "epoch": 2.104553077792302, + "grad_norm": 9.763848304748535, + "learning_rate": 1.49241153701283e-05, + "loss": 2.6576, + "step": 6770000 + }, + { + "epoch": 2.104708510072789, + "grad_norm": 8.239717483520508, + "learning_rate": 1.4921524832120185e-05, + "loss": 2.7137, + "step": 6770500 + }, + { + "epoch": 2.104863942353276, + "grad_norm": 12.438403129577637, + "learning_rate": 1.4918934294112072e-05, + "loss": 2.7511, + "step": 6771000 + }, + { + "epoch": 2.1050193746337627, + "grad_norm": 9.46337890625, + "learning_rate": 1.4916343756103956e-05, + "loss": 2.6645, + "step": 6771500 + }, + { + "epoch": 2.1051748069142495, + "grad_norm": 12.567031860351562, + "learning_rate": 1.4913753218095842e-05, + "loss": 2.6623, + "step": 6772000 + }, + { + "epoch": 2.1053302391947364, + "grad_norm": 10.832280158996582, + "learning_rate": 1.4911162680087725e-05, + "loss": 2.6805, + "step": 6772500 + }, + { + "epoch": 2.1054856714752233, + "grad_norm": 10.716951370239258, + "learning_rate": 1.4908572142079612e-05, + "loss": 2.6378, + "step": 6773000 + }, + { + "epoch": 2.10564110375571, + "grad_norm": 8.780759811401367, + "learning_rate": 1.4905981604071498e-05, + "loss": 2.6622, + "step": 6773500 + }, + { + "epoch": 2.105796536036197, + "grad_norm": 23.471044540405273, + "learning_rate": 1.4903391066063382e-05, + "loss": 2.7071, + "step": 6774000 + }, + { + "epoch": 2.105951968316684, + "grad_norm": 9.471562385559082, + "learning_rate": 1.4900800528055267e-05, + "loss": 2.7166, + "step": 6774500 + }, + { + "epoch": 2.1061074005971707, + "grad_norm": 10.965779304504395, + "learning_rate": 1.4898209990047154e-05, + "loss": 2.7014, + "step": 6775000 + }, + { + "epoch": 2.1062628328776576, + "grad_norm": 16.404727935791016, + "learning_rate": 1.489561945203904e-05, + "loss": 2.7021, + "step": 6775500 + }, + { + "epoch": 2.1064182651581445, + "grad_norm": 14.661484718322754, + "learning_rate": 1.4893028914030924e-05, + "loss": 2.6589, + "step": 6776000 + }, + { + "epoch": 2.1065736974386313, + "grad_norm": 9.638945579528809, + "learning_rate": 1.489043837602281e-05, + "loss": 2.642, + "step": 6776500 + }, + { + "epoch": 2.106729129719118, + "grad_norm": 17.52271270751953, + "learning_rate": 1.4887847838014696e-05, + "loss": 2.6239, + "step": 6777000 + }, + { + "epoch": 2.106884561999605, + "grad_norm": 7.975070476531982, + "learning_rate": 1.488525730000658e-05, + "loss": 2.655, + "step": 6777500 + }, + { + "epoch": 2.107039994280092, + "grad_norm": 9.254791259765625, + "learning_rate": 1.4882666761998465e-05, + "loss": 2.6789, + "step": 6778000 + }, + { + "epoch": 2.107195426560579, + "grad_norm": 13.795071601867676, + "learning_rate": 1.4880076223990352e-05, + "loss": 2.6705, + "step": 6778500 + }, + { + "epoch": 2.1073508588410657, + "grad_norm": 8.630692481994629, + "learning_rate": 1.4877485685982236e-05, + "loss": 2.6886, + "step": 6779000 + }, + { + "epoch": 2.1075062911215525, + "grad_norm": 26.559288024902344, + "learning_rate": 1.4874895147974122e-05, + "loss": 2.6647, + "step": 6779500 + }, + { + "epoch": 2.1076617234020394, + "grad_norm": 9.442937850952148, + "learning_rate": 1.4872304609966009e-05, + "loss": 2.7018, + "step": 6780000 + }, + { + "epoch": 2.1078171556825263, + "grad_norm": 12.679245948791504, + "learning_rate": 1.4869714071957894e-05, + "loss": 2.684, + "step": 6780500 + }, + { + "epoch": 2.107972587963013, + "grad_norm": 19.818323135375977, + "learning_rate": 1.4867123533949778e-05, + "loss": 2.6857, + "step": 6781000 + }, + { + "epoch": 2.1081280202435, + "grad_norm": 9.33795166015625, + "learning_rate": 1.4864532995941664e-05, + "loss": 2.656, + "step": 6781500 + }, + { + "epoch": 2.1082834525239873, + "grad_norm": 10.142775535583496, + "learning_rate": 1.486194245793355e-05, + "loss": 2.6678, + "step": 6782000 + }, + { + "epoch": 2.108438884804474, + "grad_norm": 7.073610782623291, + "learning_rate": 1.4859351919925434e-05, + "loss": 2.6633, + "step": 6782500 + }, + { + "epoch": 2.108594317084961, + "grad_norm": 12.663674354553223, + "learning_rate": 1.485676138191732e-05, + "loss": 2.667, + "step": 6783000 + }, + { + "epoch": 2.108749749365448, + "grad_norm": 10.117918014526367, + "learning_rate": 1.4854170843909204e-05, + "loss": 2.6446, + "step": 6783500 + }, + { + "epoch": 2.1089051816459348, + "grad_norm": 9.808725357055664, + "learning_rate": 1.4851580305901091e-05, + "loss": 2.6606, + "step": 6784000 + }, + { + "epoch": 2.1090606139264216, + "grad_norm": 10.916411399841309, + "learning_rate": 1.4848989767892976e-05, + "loss": 2.6984, + "step": 6784500 + }, + { + "epoch": 2.1092160462069085, + "grad_norm": 9.678910255432129, + "learning_rate": 1.484639922988486e-05, + "loss": 2.6278, + "step": 6785000 + }, + { + "epoch": 2.1093714784873954, + "grad_norm": 8.470498085021973, + "learning_rate": 1.4843808691876749e-05, + "loss": 2.7186, + "step": 6785500 + }, + { + "epoch": 2.1095269107678822, + "grad_norm": 8.615066528320312, + "learning_rate": 1.4841218153868633e-05, + "loss": 2.6717, + "step": 6786000 + }, + { + "epoch": 2.109682343048369, + "grad_norm": 10.22304630279541, + "learning_rate": 1.4838627615860518e-05, + "loss": 2.6719, + "step": 6786500 + }, + { + "epoch": 2.109837775328856, + "grad_norm": 19.760135650634766, + "learning_rate": 1.4836037077852402e-05, + "loss": 2.6553, + "step": 6787000 + }, + { + "epoch": 2.109993207609343, + "grad_norm": 11.4933443069458, + "learning_rate": 1.4833446539844289e-05, + "loss": 2.6795, + "step": 6787500 + }, + { + "epoch": 2.1101486398898297, + "grad_norm": 10.00850772857666, + "learning_rate": 1.4830856001836175e-05, + "loss": 2.7004, + "step": 6788000 + }, + { + "epoch": 2.1103040721703166, + "grad_norm": 9.19393539428711, + "learning_rate": 1.4828265463828058e-05, + "loss": 2.6063, + "step": 6788500 + }, + { + "epoch": 2.1104595044508034, + "grad_norm": 9.766497611999512, + "learning_rate": 1.4825674925819945e-05, + "loss": 2.6684, + "step": 6789000 + }, + { + "epoch": 2.1106149367312903, + "grad_norm": 10.892489433288574, + "learning_rate": 1.4823084387811831e-05, + "loss": 2.7095, + "step": 6789500 + }, + { + "epoch": 2.110770369011777, + "grad_norm": 9.992895126342773, + "learning_rate": 1.4820493849803715e-05, + "loss": 2.7167, + "step": 6790000 + }, + { + "epoch": 2.110925801292264, + "grad_norm": 9.919817924499512, + "learning_rate": 1.48179033117956e-05, + "loss": 2.7053, + "step": 6790500 + }, + { + "epoch": 2.111081233572751, + "grad_norm": 11.861698150634766, + "learning_rate": 1.4815312773787487e-05, + "loss": 2.6867, + "step": 6791000 + }, + { + "epoch": 2.1112366658532378, + "grad_norm": 8.733389854431152, + "learning_rate": 1.4812722235779373e-05, + "loss": 2.6437, + "step": 6791500 + }, + { + "epoch": 2.1113920981337246, + "grad_norm": 15.788026809692383, + "learning_rate": 1.4810131697771257e-05, + "loss": 2.6694, + "step": 6792000 + }, + { + "epoch": 2.1115475304142115, + "grad_norm": 11.782830238342285, + "learning_rate": 1.4807541159763142e-05, + "loss": 2.668, + "step": 6792500 + }, + { + "epoch": 2.1117029626946984, + "grad_norm": 9.772082328796387, + "learning_rate": 1.4804950621755029e-05, + "loss": 2.649, + "step": 6793000 + }, + { + "epoch": 2.1118583949751852, + "grad_norm": 9.14920711517334, + "learning_rate": 1.4802360083746913e-05, + "loss": 2.6479, + "step": 6793500 + }, + { + "epoch": 2.112013827255672, + "grad_norm": 16.93979835510254, + "learning_rate": 1.4799769545738798e-05, + "loss": 2.677, + "step": 6794000 + }, + { + "epoch": 2.112169259536159, + "grad_norm": 8.539338111877441, + "learning_rate": 1.4797179007730686e-05, + "loss": 2.6602, + "step": 6794500 + }, + { + "epoch": 2.112324691816646, + "grad_norm": 13.893122673034668, + "learning_rate": 1.479458846972257e-05, + "loss": 2.6462, + "step": 6795000 + }, + { + "epoch": 2.1124801240971327, + "grad_norm": 9.502535820007324, + "learning_rate": 1.4791997931714455e-05, + "loss": 2.6577, + "step": 6795500 + }, + { + "epoch": 2.1126355563776196, + "grad_norm": 21.096023559570312, + "learning_rate": 1.4789407393706338e-05, + "loss": 2.6363, + "step": 6796000 + }, + { + "epoch": 2.1127909886581064, + "grad_norm": 12.110322952270508, + "learning_rate": 1.4786816855698227e-05, + "loss": 2.6254, + "step": 6796500 + }, + { + "epoch": 2.1129464209385933, + "grad_norm": 9.030158996582031, + "learning_rate": 1.4784226317690111e-05, + "loss": 2.6893, + "step": 6797000 + }, + { + "epoch": 2.11310185321908, + "grad_norm": 9.834775924682617, + "learning_rate": 1.4781635779681997e-05, + "loss": 2.7045, + "step": 6797500 + }, + { + "epoch": 2.113257285499567, + "grad_norm": 9.582098007202148, + "learning_rate": 1.4779045241673884e-05, + "loss": 2.658, + "step": 6798000 + }, + { + "epoch": 2.113412717780054, + "grad_norm": 11.863574028015137, + "learning_rate": 1.4776454703665767e-05, + "loss": 2.6691, + "step": 6798500 + }, + { + "epoch": 2.1135681500605408, + "grad_norm": 11.261332511901855, + "learning_rate": 1.4773864165657653e-05, + "loss": 2.6536, + "step": 6799000 + }, + { + "epoch": 2.1137235823410276, + "grad_norm": 16.010908126831055, + "learning_rate": 1.4771273627649537e-05, + "loss": 2.6753, + "step": 6799500 + }, + { + "epoch": 2.1138790146215145, + "grad_norm": 10.657952308654785, + "learning_rate": 1.4768683089641424e-05, + "loss": 2.6782, + "step": 6800000 + }, + { + "epoch": 2.1140344469020014, + "grad_norm": 16.72688102722168, + "learning_rate": 1.476609255163331e-05, + "loss": 2.5851, + "step": 6800500 + }, + { + "epoch": 2.1141898791824882, + "grad_norm": 13.236336708068848, + "learning_rate": 1.4763502013625193e-05, + "loss": 2.6871, + "step": 6801000 + }, + { + "epoch": 2.114345311462975, + "grad_norm": 9.424922943115234, + "learning_rate": 1.4760911475617079e-05, + "loss": 2.6548, + "step": 6801500 + }, + { + "epoch": 2.114500743743462, + "grad_norm": 10.419600486755371, + "learning_rate": 1.4758320937608966e-05, + "loss": 2.6512, + "step": 6802000 + }, + { + "epoch": 2.114656176023949, + "grad_norm": 8.650710105895996, + "learning_rate": 1.4755730399600851e-05, + "loss": 2.6835, + "step": 6802500 + }, + { + "epoch": 2.1148116083044357, + "grad_norm": 7.233778953552246, + "learning_rate": 1.4753139861592735e-05, + "loss": 2.6252, + "step": 6803000 + }, + { + "epoch": 2.1149670405849226, + "grad_norm": 9.224433898925781, + "learning_rate": 1.4750549323584622e-05, + "loss": 2.6665, + "step": 6803500 + }, + { + "epoch": 2.1151224728654094, + "grad_norm": 8.304932594299316, + "learning_rate": 1.4747958785576508e-05, + "loss": 2.6823, + "step": 6804000 + }, + { + "epoch": 2.1152779051458963, + "grad_norm": 11.183974266052246, + "learning_rate": 1.4745368247568391e-05, + "loss": 2.6794, + "step": 6804500 + }, + { + "epoch": 2.115433337426383, + "grad_norm": 11.034014701843262, + "learning_rate": 1.4742777709560277e-05, + "loss": 2.6343, + "step": 6805000 + }, + { + "epoch": 2.1155887697068705, + "grad_norm": 7.067988872528076, + "learning_rate": 1.4740187171552164e-05, + "loss": 2.7016, + "step": 6805500 + }, + { + "epoch": 2.115744201987357, + "grad_norm": 9.4258394241333, + "learning_rate": 1.4737596633544048e-05, + "loss": 2.6442, + "step": 6806000 + }, + { + "epoch": 2.115899634267844, + "grad_norm": 11.045641899108887, + "learning_rate": 1.4735006095535933e-05, + "loss": 2.6803, + "step": 6806500 + }, + { + "epoch": 2.116055066548331, + "grad_norm": 11.535696983337402, + "learning_rate": 1.473241555752782e-05, + "loss": 2.6871, + "step": 6807000 + }, + { + "epoch": 2.116210498828818, + "grad_norm": 11.828169822692871, + "learning_rate": 1.4729825019519706e-05, + "loss": 2.6821, + "step": 6807500 + }, + { + "epoch": 2.116365931109305, + "grad_norm": 15.159358978271484, + "learning_rate": 1.472723448151159e-05, + "loss": 2.6565, + "step": 6808000 + }, + { + "epoch": 2.1165213633897917, + "grad_norm": 11.015568733215332, + "learning_rate": 1.4724643943503475e-05, + "loss": 2.7087, + "step": 6808500 + }, + { + "epoch": 2.1166767956702786, + "grad_norm": 9.61977481842041, + "learning_rate": 1.4722053405495362e-05, + "loss": 2.6631, + "step": 6809000 + }, + { + "epoch": 2.1168322279507654, + "grad_norm": 18.86427116394043, + "learning_rate": 1.4719462867487246e-05, + "loss": 2.6601, + "step": 6809500 + }, + { + "epoch": 2.1169876602312523, + "grad_norm": 9.905525207519531, + "learning_rate": 1.4716872329479131e-05, + "loss": 2.639, + "step": 6810000 + }, + { + "epoch": 2.117143092511739, + "grad_norm": 10.416229248046875, + "learning_rate": 1.4714281791471015e-05, + "loss": 2.6639, + "step": 6810500 + }, + { + "epoch": 2.117298524792226, + "grad_norm": 9.223343849182129, + "learning_rate": 1.4711691253462902e-05, + "loss": 2.6537, + "step": 6811000 + }, + { + "epoch": 2.117453957072713, + "grad_norm": 10.753021240234375, + "learning_rate": 1.4709100715454788e-05, + "loss": 2.6796, + "step": 6811500 + }, + { + "epoch": 2.1176093893531998, + "grad_norm": 11.2338228225708, + "learning_rate": 1.4706510177446673e-05, + "loss": 2.6585, + "step": 6812000 + }, + { + "epoch": 2.1177648216336866, + "grad_norm": 12.329392433166504, + "learning_rate": 1.470391963943856e-05, + "loss": 2.6961, + "step": 6812500 + }, + { + "epoch": 2.1179202539141735, + "grad_norm": 59.178585052490234, + "learning_rate": 1.4701329101430444e-05, + "loss": 2.6949, + "step": 6813000 + }, + { + "epoch": 2.1180756861946604, + "grad_norm": 8.835049629211426, + "learning_rate": 1.469873856342233e-05, + "loss": 2.6451, + "step": 6813500 + }, + { + "epoch": 2.1182311184751472, + "grad_norm": 13.871731758117676, + "learning_rate": 1.4696148025414213e-05, + "loss": 2.6775, + "step": 6814000 + }, + { + "epoch": 2.118386550755634, + "grad_norm": 10.770038604736328, + "learning_rate": 1.46935574874061e-05, + "loss": 2.6451, + "step": 6814500 + }, + { + "epoch": 2.118541983036121, + "grad_norm": 9.333625793457031, + "learning_rate": 1.4690966949397986e-05, + "loss": 2.6771, + "step": 6815000 + }, + { + "epoch": 2.118697415316608, + "grad_norm": 9.313281059265137, + "learning_rate": 1.468837641138987e-05, + "loss": 2.7102, + "step": 6815500 + }, + { + "epoch": 2.1188528475970947, + "grad_norm": 17.623517990112305, + "learning_rate": 1.4685785873381757e-05, + "loss": 2.664, + "step": 6816000 + }, + { + "epoch": 2.1190082798775816, + "grad_norm": 9.967928886413574, + "learning_rate": 1.4683195335373642e-05, + "loss": 2.6777, + "step": 6816500 + }, + { + "epoch": 2.1191637121580684, + "grad_norm": 8.609987258911133, + "learning_rate": 1.4680604797365528e-05, + "loss": 2.6846, + "step": 6817000 + }, + { + "epoch": 2.1193191444385553, + "grad_norm": 11.101632118225098, + "learning_rate": 1.4678014259357412e-05, + "loss": 2.6872, + "step": 6817500 + }, + { + "epoch": 2.119474576719042, + "grad_norm": 9.473284721374512, + "learning_rate": 1.4675423721349299e-05, + "loss": 2.6633, + "step": 6818000 + }, + { + "epoch": 2.119630008999529, + "grad_norm": 14.051163673400879, + "learning_rate": 1.4672833183341184e-05, + "loss": 2.6578, + "step": 6818500 + }, + { + "epoch": 2.119785441280016, + "grad_norm": 8.23104476928711, + "learning_rate": 1.4670242645333068e-05, + "loss": 2.6406, + "step": 6819000 + }, + { + "epoch": 2.1199408735605028, + "grad_norm": 9.832818031311035, + "learning_rate": 1.4667652107324953e-05, + "loss": 2.6864, + "step": 6819500 + }, + { + "epoch": 2.1200963058409896, + "grad_norm": 10.420619010925293, + "learning_rate": 1.466506156931684e-05, + "loss": 2.6965, + "step": 6820000 + }, + { + "epoch": 2.1202517381214765, + "grad_norm": 9.743550300598145, + "learning_rate": 1.4662471031308724e-05, + "loss": 2.6754, + "step": 6820500 + }, + { + "epoch": 2.1204071704019634, + "grad_norm": 10.859271049499512, + "learning_rate": 1.465988049330061e-05, + "loss": 2.6529, + "step": 6821000 + }, + { + "epoch": 2.1205626026824502, + "grad_norm": 10.82365608215332, + "learning_rate": 1.4657289955292497e-05, + "loss": 2.699, + "step": 6821500 + }, + { + "epoch": 2.120718034962937, + "grad_norm": 7.445333003997803, + "learning_rate": 1.4654699417284382e-05, + "loss": 2.6798, + "step": 6822000 + }, + { + "epoch": 2.120873467243424, + "grad_norm": 8.455512046813965, + "learning_rate": 1.4652108879276266e-05, + "loss": 2.6291, + "step": 6822500 + }, + { + "epoch": 2.121028899523911, + "grad_norm": 10.599400520324707, + "learning_rate": 1.4649518341268152e-05, + "loss": 2.6622, + "step": 6823000 + }, + { + "epoch": 2.1211843318043977, + "grad_norm": 33.871212005615234, + "learning_rate": 1.4646927803260039e-05, + "loss": 2.6966, + "step": 6823500 + }, + { + "epoch": 2.1213397640848846, + "grad_norm": 9.9841947555542, + "learning_rate": 1.4644337265251923e-05, + "loss": 2.7017, + "step": 6824000 + }, + { + "epoch": 2.1214951963653714, + "grad_norm": 11.556727409362793, + "learning_rate": 1.4641746727243808e-05, + "loss": 2.6788, + "step": 6824500 + }, + { + "epoch": 2.1216506286458583, + "grad_norm": 9.10141658782959, + "learning_rate": 1.4639156189235695e-05, + "loss": 2.6433, + "step": 6825000 + }, + { + "epoch": 2.121806060926345, + "grad_norm": 10.239809036254883, + "learning_rate": 1.4636565651227579e-05, + "loss": 2.6388, + "step": 6825500 + }, + { + "epoch": 2.121961493206832, + "grad_norm": 19.013504028320312, + "learning_rate": 1.4633975113219464e-05, + "loss": 2.67, + "step": 6826000 + }, + { + "epoch": 2.122116925487319, + "grad_norm": 11.85987377166748, + "learning_rate": 1.4631384575211348e-05, + "loss": 2.6743, + "step": 6826500 + }, + { + "epoch": 2.1222723577678058, + "grad_norm": 10.34070873260498, + "learning_rate": 1.4628794037203237e-05, + "loss": 2.7008, + "step": 6827000 + }, + { + "epoch": 2.1224277900482926, + "grad_norm": 9.555188179016113, + "learning_rate": 1.462620349919512e-05, + "loss": 2.6693, + "step": 6827500 + }, + { + "epoch": 2.1225832223287795, + "grad_norm": 13.680719375610352, + "learning_rate": 1.4623612961187006e-05, + "loss": 2.6925, + "step": 6828000 + }, + { + "epoch": 2.1227386546092664, + "grad_norm": 8.269834518432617, + "learning_rate": 1.462102242317889e-05, + "loss": 2.6844, + "step": 6828500 + }, + { + "epoch": 2.1228940868897532, + "grad_norm": 12.498739242553711, + "learning_rate": 1.4618431885170777e-05, + "loss": 2.7059, + "step": 6829000 + }, + { + "epoch": 2.12304951917024, + "grad_norm": 12.68932056427002, + "learning_rate": 1.4615841347162663e-05, + "loss": 2.6558, + "step": 6829500 + }, + { + "epoch": 2.1232049514507274, + "grad_norm": 9.844047546386719, + "learning_rate": 1.4613250809154546e-05, + "loss": 2.6411, + "step": 6830000 + }, + { + "epoch": 2.1233603837312143, + "grad_norm": 11.454084396362305, + "learning_rate": 1.4610660271146433e-05, + "loss": 2.6439, + "step": 6830500 + }, + { + "epoch": 2.123515816011701, + "grad_norm": 50.86468505859375, + "learning_rate": 1.4608069733138319e-05, + "loss": 2.6784, + "step": 6831000 + }, + { + "epoch": 2.123671248292188, + "grad_norm": 10.001214981079102, + "learning_rate": 1.4605479195130203e-05, + "loss": 2.6149, + "step": 6831500 + }, + { + "epoch": 2.123826680572675, + "grad_norm": 10.393287658691406, + "learning_rate": 1.4602888657122088e-05, + "loss": 2.6813, + "step": 6832000 + }, + { + "epoch": 2.1239821128531617, + "grad_norm": 9.243529319763184, + "learning_rate": 1.4600298119113975e-05, + "loss": 2.663, + "step": 6832500 + }, + { + "epoch": 2.1241375451336486, + "grad_norm": 7.943065643310547, + "learning_rate": 1.459770758110586e-05, + "loss": 2.5994, + "step": 6833000 + }, + { + "epoch": 2.1242929774141355, + "grad_norm": 11.63392448425293, + "learning_rate": 1.4595117043097745e-05, + "loss": 2.6819, + "step": 6833500 + }, + { + "epoch": 2.1244484096946223, + "grad_norm": 14.050796508789062, + "learning_rate": 1.4592526505089632e-05, + "loss": 2.6517, + "step": 6834000 + }, + { + "epoch": 2.124603841975109, + "grad_norm": 11.421100616455078, + "learning_rate": 1.4589935967081517e-05, + "loss": 2.6509, + "step": 6834500 + }, + { + "epoch": 2.124759274255596, + "grad_norm": 11.293279647827148, + "learning_rate": 1.4587345429073401e-05, + "loss": 2.6615, + "step": 6835000 + }, + { + "epoch": 2.124914706536083, + "grad_norm": 8.876328468322754, + "learning_rate": 1.4584754891065286e-05, + "loss": 2.6517, + "step": 6835500 + }, + { + "epoch": 2.12507013881657, + "grad_norm": 10.45811939239502, + "learning_rate": 1.4582164353057174e-05, + "loss": 2.6516, + "step": 6836000 + }, + { + "epoch": 2.1252255710970567, + "grad_norm": 8.639139175415039, + "learning_rate": 1.4579573815049057e-05, + "loss": 2.6356, + "step": 6836500 + }, + { + "epoch": 2.1253810033775435, + "grad_norm": 11.109100341796875, + "learning_rate": 1.4576983277040943e-05, + "loss": 2.6372, + "step": 6837000 + }, + { + "epoch": 2.1255364356580304, + "grad_norm": 11.27358627319336, + "learning_rate": 1.4574392739032827e-05, + "loss": 2.6529, + "step": 6837500 + }, + { + "epoch": 2.1256918679385173, + "grad_norm": 13.15208911895752, + "learning_rate": 1.4571802201024715e-05, + "loss": 2.6926, + "step": 6838000 + }, + { + "epoch": 2.125847300219004, + "grad_norm": 11.506208419799805, + "learning_rate": 1.4569211663016599e-05, + "loss": 2.6544, + "step": 6838500 + }, + { + "epoch": 2.126002732499491, + "grad_norm": 9.136913299560547, + "learning_rate": 1.4566621125008485e-05, + "loss": 2.6976, + "step": 6839000 + }, + { + "epoch": 2.126158164779978, + "grad_norm": 8.879674911499023, + "learning_rate": 1.4564030587000372e-05, + "loss": 2.6721, + "step": 6839500 + }, + { + "epoch": 2.1263135970604647, + "grad_norm": 8.536579132080078, + "learning_rate": 1.4561440048992256e-05, + "loss": 2.6465, + "step": 6840000 + }, + { + "epoch": 2.1264690293409516, + "grad_norm": 14.545465469360352, + "learning_rate": 1.4558849510984141e-05, + "loss": 2.6996, + "step": 6840500 + }, + { + "epoch": 2.1266244616214385, + "grad_norm": 11.808341026306152, + "learning_rate": 1.4556258972976025e-05, + "loss": 2.6263, + "step": 6841000 + }, + { + "epoch": 2.1267798939019253, + "grad_norm": 9.676928520202637, + "learning_rate": 1.4553668434967912e-05, + "loss": 2.6352, + "step": 6841500 + }, + { + "epoch": 2.126935326182412, + "grad_norm": 12.174174308776855, + "learning_rate": 1.4551077896959797e-05, + "loss": 2.6374, + "step": 6842000 + }, + { + "epoch": 2.127090758462899, + "grad_norm": 12.05647087097168, + "learning_rate": 1.4548487358951681e-05, + "loss": 2.6552, + "step": 6842500 + }, + { + "epoch": 2.127246190743386, + "grad_norm": 10.109627723693848, + "learning_rate": 1.454589682094357e-05, + "loss": 2.7248, + "step": 6843000 + }, + { + "epoch": 2.127401623023873, + "grad_norm": 7.510142803192139, + "learning_rate": 1.4543306282935454e-05, + "loss": 2.6777, + "step": 6843500 + }, + { + "epoch": 2.1275570553043597, + "grad_norm": 9.449694633483887, + "learning_rate": 1.454071574492734e-05, + "loss": 2.6658, + "step": 6844000 + }, + { + "epoch": 2.1277124875848465, + "grad_norm": 8.97730827331543, + "learning_rate": 1.4538125206919223e-05, + "loss": 2.7013, + "step": 6844500 + }, + { + "epoch": 2.1278679198653334, + "grad_norm": 9.229723930358887, + "learning_rate": 1.453553466891111e-05, + "loss": 2.6516, + "step": 6845000 + }, + { + "epoch": 2.1280233521458203, + "grad_norm": 9.45975112915039, + "learning_rate": 1.4532944130902996e-05, + "loss": 2.6649, + "step": 6845500 + }, + { + "epoch": 2.128178784426307, + "grad_norm": 7.535940170288086, + "learning_rate": 1.453035359289488e-05, + "loss": 2.6351, + "step": 6846000 + }, + { + "epoch": 2.128334216706794, + "grad_norm": 9.746220588684082, + "learning_rate": 1.4527763054886765e-05, + "loss": 2.6658, + "step": 6846500 + }, + { + "epoch": 2.128489648987281, + "grad_norm": 9.206692695617676, + "learning_rate": 1.4525172516878652e-05, + "loss": 2.6801, + "step": 6847000 + }, + { + "epoch": 2.1286450812677677, + "grad_norm": 9.958181381225586, + "learning_rate": 1.4522581978870536e-05, + "loss": 2.6617, + "step": 6847500 + }, + { + "epoch": 2.1288005135482546, + "grad_norm": 9.391329765319824, + "learning_rate": 1.4519991440862421e-05, + "loss": 2.6757, + "step": 6848000 + }, + { + "epoch": 2.1289559458287415, + "grad_norm": 8.36530590057373, + "learning_rate": 1.4517400902854308e-05, + "loss": 2.66, + "step": 6848500 + }, + { + "epoch": 2.1291113781092283, + "grad_norm": 11.041136741638184, + "learning_rate": 1.4514810364846194e-05, + "loss": 2.6542, + "step": 6849000 + }, + { + "epoch": 2.129266810389715, + "grad_norm": 10.612936019897461, + "learning_rate": 1.4512219826838078e-05, + "loss": 2.701, + "step": 6849500 + }, + { + "epoch": 2.129422242670202, + "grad_norm": 14.697319030761719, + "learning_rate": 1.4509629288829963e-05, + "loss": 2.6755, + "step": 6850000 + }, + { + "epoch": 2.129577674950689, + "grad_norm": 9.430266380310059, + "learning_rate": 1.450703875082185e-05, + "loss": 2.6761, + "step": 6850500 + }, + { + "epoch": 2.129733107231176, + "grad_norm": 21.615751266479492, + "learning_rate": 1.4504448212813734e-05, + "loss": 2.6993, + "step": 6851000 + }, + { + "epoch": 2.1298885395116627, + "grad_norm": 9.278876304626465, + "learning_rate": 1.450185767480562e-05, + "loss": 2.6785, + "step": 6851500 + }, + { + "epoch": 2.1300439717921495, + "grad_norm": 10.087145805358887, + "learning_rate": 1.4499267136797507e-05, + "loss": 2.6796, + "step": 6852000 + }, + { + "epoch": 2.1301994040726364, + "grad_norm": 28.391752243041992, + "learning_rate": 1.449667659878939e-05, + "loss": 2.6314, + "step": 6852500 + }, + { + "epoch": 2.1303548363531233, + "grad_norm": 8.158966064453125, + "learning_rate": 1.4494086060781276e-05, + "loss": 2.6812, + "step": 6853000 + }, + { + "epoch": 2.1305102686336106, + "grad_norm": 11.304254531860352, + "learning_rate": 1.449149552277316e-05, + "loss": 2.6734, + "step": 6853500 + }, + { + "epoch": 2.130665700914097, + "grad_norm": 9.670421600341797, + "learning_rate": 1.4488904984765048e-05, + "loss": 2.6516, + "step": 6854000 + }, + { + "epoch": 2.1308211331945843, + "grad_norm": 9.468767166137695, + "learning_rate": 1.4486314446756932e-05, + "loss": 2.6659, + "step": 6854500 + }, + { + "epoch": 2.130976565475071, + "grad_norm": 6.681613922119141, + "learning_rate": 1.4483723908748818e-05, + "loss": 2.6494, + "step": 6855000 + }, + { + "epoch": 2.131131997755558, + "grad_norm": 6.283239841461182, + "learning_rate": 1.4481133370740701e-05, + "loss": 2.6824, + "step": 6855500 + }, + { + "epoch": 2.131287430036045, + "grad_norm": 7.301271915435791, + "learning_rate": 1.4478542832732589e-05, + "loss": 2.7208, + "step": 6856000 + }, + { + "epoch": 2.131442862316532, + "grad_norm": 9.056465148925781, + "learning_rate": 1.4475952294724474e-05, + "loss": 2.6181, + "step": 6856500 + }, + { + "epoch": 2.1315982945970187, + "grad_norm": 16.042835235595703, + "learning_rate": 1.4473361756716358e-05, + "loss": 2.6791, + "step": 6857000 + }, + { + "epoch": 2.1317537268775055, + "grad_norm": 13.877984046936035, + "learning_rate": 1.4470771218708245e-05, + "loss": 2.6935, + "step": 6857500 + }, + { + "epoch": 2.1319091591579924, + "grad_norm": 11.488877296447754, + "learning_rate": 1.446818068070013e-05, + "loss": 2.6497, + "step": 6858000 + }, + { + "epoch": 2.1320645914384793, + "grad_norm": 10.457076072692871, + "learning_rate": 1.4465590142692014e-05, + "loss": 2.6685, + "step": 6858500 + }, + { + "epoch": 2.132220023718966, + "grad_norm": 10.218343734741211, + "learning_rate": 1.44629996046839e-05, + "loss": 2.6844, + "step": 6859000 + }, + { + "epoch": 2.132375455999453, + "grad_norm": 15.738184928894043, + "learning_rate": 1.4460409066675787e-05, + "loss": 2.6913, + "step": 6859500 + }, + { + "epoch": 2.13253088827994, + "grad_norm": 12.742661476135254, + "learning_rate": 1.4457818528667672e-05, + "loss": 2.6095, + "step": 6860000 + }, + { + "epoch": 2.1326863205604267, + "grad_norm": 25.498764038085938, + "learning_rate": 1.4455227990659556e-05, + "loss": 2.6864, + "step": 6860500 + }, + { + "epoch": 2.1328417528409136, + "grad_norm": 12.016704559326172, + "learning_rate": 1.4452637452651443e-05, + "loss": 2.7143, + "step": 6861000 + }, + { + "epoch": 2.1329971851214005, + "grad_norm": 11.172771453857422, + "learning_rate": 1.4450046914643329e-05, + "loss": 2.7293, + "step": 6861500 + }, + { + "epoch": 2.1331526174018873, + "grad_norm": 10.696175575256348, + "learning_rate": 1.4447456376635212e-05, + "loss": 2.6673, + "step": 6862000 + }, + { + "epoch": 2.133308049682374, + "grad_norm": 9.56389331817627, + "learning_rate": 1.4444865838627098e-05, + "loss": 2.6308, + "step": 6862500 + }, + { + "epoch": 2.133463481962861, + "grad_norm": 11.592123031616211, + "learning_rate": 1.4442275300618985e-05, + "loss": 2.6944, + "step": 6863000 + }, + { + "epoch": 2.133618914243348, + "grad_norm": 9.710485458374023, + "learning_rate": 1.4439684762610869e-05, + "loss": 2.6677, + "step": 6863500 + }, + { + "epoch": 2.133774346523835, + "grad_norm": 10.563745498657227, + "learning_rate": 1.4437094224602754e-05, + "loss": 2.6463, + "step": 6864000 + }, + { + "epoch": 2.1339297788043217, + "grad_norm": 10.29988956451416, + "learning_rate": 1.4434503686594638e-05, + "loss": 2.7331, + "step": 6864500 + }, + { + "epoch": 2.1340852110848085, + "grad_norm": 10.619552612304688, + "learning_rate": 1.4431913148586527e-05, + "loss": 2.7193, + "step": 6865000 + }, + { + "epoch": 2.1342406433652954, + "grad_norm": 9.876992225646973, + "learning_rate": 1.442932261057841e-05, + "loss": 2.6937, + "step": 6865500 + }, + { + "epoch": 2.1343960756457823, + "grad_norm": 9.662376403808594, + "learning_rate": 1.4426732072570296e-05, + "loss": 2.6838, + "step": 6866000 + }, + { + "epoch": 2.134551507926269, + "grad_norm": 13.637968063354492, + "learning_rate": 1.4424141534562183e-05, + "loss": 2.6485, + "step": 6866500 + }, + { + "epoch": 2.134706940206756, + "grad_norm": 7.789614200592041, + "learning_rate": 1.4421550996554067e-05, + "loss": 2.6598, + "step": 6867000 + }, + { + "epoch": 2.134862372487243, + "grad_norm": 8.098257064819336, + "learning_rate": 1.4418960458545952e-05, + "loss": 2.6654, + "step": 6867500 + }, + { + "epoch": 2.1350178047677297, + "grad_norm": 9.021454811096191, + "learning_rate": 1.4416369920537836e-05, + "loss": 2.696, + "step": 6868000 + }, + { + "epoch": 2.1351732370482166, + "grad_norm": 11.606201171875, + "learning_rate": 1.4413779382529723e-05, + "loss": 2.6976, + "step": 6868500 + }, + { + "epoch": 2.1353286693287035, + "grad_norm": 12.1109619140625, + "learning_rate": 1.4411188844521609e-05, + "loss": 2.6803, + "step": 6869000 + }, + { + "epoch": 2.1354841016091903, + "grad_norm": 11.361932754516602, + "learning_rate": 1.4408598306513493e-05, + "loss": 2.6942, + "step": 6869500 + }, + { + "epoch": 2.135639533889677, + "grad_norm": 6.221851348876953, + "learning_rate": 1.4406007768505381e-05, + "loss": 2.7001, + "step": 6870000 + }, + { + "epoch": 2.135794966170164, + "grad_norm": 34.07106399536133, + "learning_rate": 1.4403417230497265e-05, + "loss": 2.6848, + "step": 6870500 + }, + { + "epoch": 2.135950398450651, + "grad_norm": 9.83650016784668, + "learning_rate": 1.440082669248915e-05, + "loss": 2.6559, + "step": 6871000 + }, + { + "epoch": 2.136105830731138, + "grad_norm": 8.065422058105469, + "learning_rate": 1.4398236154481034e-05, + "loss": 2.6649, + "step": 6871500 + }, + { + "epoch": 2.1362612630116247, + "grad_norm": 10.4937744140625, + "learning_rate": 1.4395645616472922e-05, + "loss": 2.6395, + "step": 6872000 + }, + { + "epoch": 2.1364166952921115, + "grad_norm": 9.459569931030273, + "learning_rate": 1.4393055078464807e-05, + "loss": 2.6632, + "step": 6872500 + }, + { + "epoch": 2.1365721275725984, + "grad_norm": 9.591558456420898, + "learning_rate": 1.439046454045669e-05, + "loss": 2.6649, + "step": 6873000 + }, + { + "epoch": 2.1367275598530853, + "grad_norm": 9.429068565368652, + "learning_rate": 1.4387874002448576e-05, + "loss": 2.6873, + "step": 6873500 + }, + { + "epoch": 2.136882992133572, + "grad_norm": 14.76431655883789, + "learning_rate": 1.4385283464440463e-05, + "loss": 2.7083, + "step": 6874000 + }, + { + "epoch": 2.137038424414059, + "grad_norm": 42.05842590332031, + "learning_rate": 1.4382692926432347e-05, + "loss": 2.7143, + "step": 6874500 + }, + { + "epoch": 2.137193856694546, + "grad_norm": 7.535749912261963, + "learning_rate": 1.4380102388424233e-05, + "loss": 2.6797, + "step": 6875000 + }, + { + "epoch": 2.1373492889750327, + "grad_norm": 8.32791519165039, + "learning_rate": 1.437751185041612e-05, + "loss": 2.6745, + "step": 6875500 + }, + { + "epoch": 2.1375047212555196, + "grad_norm": 11.542213439941406, + "learning_rate": 1.4374921312408005e-05, + "loss": 2.7026, + "step": 6876000 + }, + { + "epoch": 2.1376601535360065, + "grad_norm": 13.268099784851074, + "learning_rate": 1.4372330774399889e-05, + "loss": 2.6728, + "step": 6876500 + }, + { + "epoch": 2.1378155858164933, + "grad_norm": 8.63471794128418, + "learning_rate": 1.4369740236391774e-05, + "loss": 2.6446, + "step": 6877000 + }, + { + "epoch": 2.13797101809698, + "grad_norm": 10.68270492553711, + "learning_rate": 1.4367149698383662e-05, + "loss": 2.6846, + "step": 6877500 + }, + { + "epoch": 2.1381264503774675, + "grad_norm": 10.018420219421387, + "learning_rate": 1.4364559160375545e-05, + "loss": 2.711, + "step": 6878000 + }, + { + "epoch": 2.138281882657954, + "grad_norm": 12.660813331604004, + "learning_rate": 1.436196862236743e-05, + "loss": 2.6364, + "step": 6878500 + }, + { + "epoch": 2.1384373149384412, + "grad_norm": 9.646154403686523, + "learning_rate": 1.4359378084359318e-05, + "loss": 2.6817, + "step": 6879000 + }, + { + "epoch": 2.138592747218928, + "grad_norm": 10.418947219848633, + "learning_rate": 1.4356787546351202e-05, + "loss": 2.707, + "step": 6879500 + }, + { + "epoch": 2.138748179499415, + "grad_norm": 9.52601432800293, + "learning_rate": 1.4354197008343087e-05, + "loss": 2.6948, + "step": 6880000 + }, + { + "epoch": 2.138903611779902, + "grad_norm": 11.462823867797852, + "learning_rate": 1.4351606470334971e-05, + "loss": 2.6681, + "step": 6880500 + }, + { + "epoch": 2.1390590440603887, + "grad_norm": 18.56475830078125, + "learning_rate": 1.434901593232686e-05, + "loss": 2.709, + "step": 6881000 + }, + { + "epoch": 2.1392144763408756, + "grad_norm": 23.34811019897461, + "learning_rate": 1.4346425394318744e-05, + "loss": 2.6792, + "step": 6881500 + }, + { + "epoch": 2.1393699086213624, + "grad_norm": 9.9599609375, + "learning_rate": 1.4343834856310629e-05, + "loss": 2.6734, + "step": 6882000 + }, + { + "epoch": 2.1395253409018493, + "grad_norm": 10.280974388122559, + "learning_rate": 1.4341244318302513e-05, + "loss": 2.6397, + "step": 6882500 + }, + { + "epoch": 2.139680773182336, + "grad_norm": 7.8518476486206055, + "learning_rate": 1.43386537802944e-05, + "loss": 2.7281, + "step": 6883000 + }, + { + "epoch": 2.139836205462823, + "grad_norm": 11.902359008789062, + "learning_rate": 1.4336063242286285e-05, + "loss": 2.6811, + "step": 6883500 + }, + { + "epoch": 2.13999163774331, + "grad_norm": 10.203522682189941, + "learning_rate": 1.433347270427817e-05, + "loss": 2.6599, + "step": 6884000 + }, + { + "epoch": 2.140147070023797, + "grad_norm": 10.05117416381836, + "learning_rate": 1.4330882166270056e-05, + "loss": 2.6839, + "step": 6884500 + }, + { + "epoch": 2.1403025023042836, + "grad_norm": 9.849023818969727, + "learning_rate": 1.4328291628261942e-05, + "loss": 2.6668, + "step": 6885000 + }, + { + "epoch": 2.1404579345847705, + "grad_norm": 16.76726531982422, + "learning_rate": 1.4325701090253826e-05, + "loss": 2.6535, + "step": 6885500 + }, + { + "epoch": 2.1406133668652574, + "grad_norm": 8.897229194641113, + "learning_rate": 1.4323110552245711e-05, + "loss": 2.6365, + "step": 6886000 + }, + { + "epoch": 2.1407687991457443, + "grad_norm": 18.05257797241211, + "learning_rate": 1.4320520014237598e-05, + "loss": 2.6876, + "step": 6886500 + }, + { + "epoch": 2.140924231426231, + "grad_norm": 9.049344062805176, + "learning_rate": 1.4317929476229484e-05, + "loss": 2.6563, + "step": 6887000 + }, + { + "epoch": 2.141079663706718, + "grad_norm": 8.577264785766602, + "learning_rate": 1.4315338938221367e-05, + "loss": 2.6824, + "step": 6887500 + }, + { + "epoch": 2.141235095987205, + "grad_norm": 10.245281219482422, + "learning_rate": 1.4312748400213255e-05, + "loss": 2.6577, + "step": 6888000 + }, + { + "epoch": 2.1413905282676917, + "grad_norm": 9.126333236694336, + "learning_rate": 1.431015786220514e-05, + "loss": 2.676, + "step": 6888500 + }, + { + "epoch": 2.1415459605481786, + "grad_norm": 10.118005752563477, + "learning_rate": 1.4307567324197024e-05, + "loss": 2.6565, + "step": 6889000 + }, + { + "epoch": 2.1417013928286655, + "grad_norm": 8.814192771911621, + "learning_rate": 1.430497678618891e-05, + "loss": 2.7058, + "step": 6889500 + }, + { + "epoch": 2.1418568251091523, + "grad_norm": 8.16393756866455, + "learning_rate": 1.4302386248180796e-05, + "loss": 2.6629, + "step": 6890000 + }, + { + "epoch": 2.142012257389639, + "grad_norm": 8.721551895141602, + "learning_rate": 1.429979571017268e-05, + "loss": 2.6823, + "step": 6890500 + }, + { + "epoch": 2.142167689670126, + "grad_norm": 9.663932800292969, + "learning_rate": 1.4297205172164566e-05, + "loss": 2.667, + "step": 6891000 + }, + { + "epoch": 2.142323121950613, + "grad_norm": 13.339838981628418, + "learning_rate": 1.429461463415645e-05, + "loss": 2.6573, + "step": 6891500 + }, + { + "epoch": 2.1424785542311, + "grad_norm": 16.537303924560547, + "learning_rate": 1.4292024096148338e-05, + "loss": 2.6626, + "step": 6892000 + }, + { + "epoch": 2.1426339865115867, + "grad_norm": 25.246292114257812, + "learning_rate": 1.4289433558140222e-05, + "loss": 2.702, + "step": 6892500 + }, + { + "epoch": 2.1427894187920735, + "grad_norm": 10.30417251586914, + "learning_rate": 1.4286843020132107e-05, + "loss": 2.6608, + "step": 6893000 + }, + { + "epoch": 2.1429448510725604, + "grad_norm": 10.033483505249023, + "learning_rate": 1.4284252482123995e-05, + "loss": 2.6365, + "step": 6893500 + }, + { + "epoch": 2.1431002833530473, + "grad_norm": 7.693051338195801, + "learning_rate": 1.4281661944115878e-05, + "loss": 2.654, + "step": 6894000 + }, + { + "epoch": 2.143255715633534, + "grad_norm": 11.793916702270508, + "learning_rate": 1.4279071406107764e-05, + "loss": 2.6457, + "step": 6894500 + }, + { + "epoch": 2.143411147914021, + "grad_norm": 15.174688339233398, + "learning_rate": 1.4276480868099648e-05, + "loss": 2.6732, + "step": 6895000 + }, + { + "epoch": 2.143566580194508, + "grad_norm": 7.827582836151123, + "learning_rate": 1.4273890330091535e-05, + "loss": 2.677, + "step": 6895500 + }, + { + "epoch": 2.1437220124749947, + "grad_norm": 10.292912483215332, + "learning_rate": 1.427129979208342e-05, + "loss": 2.6885, + "step": 6896000 + }, + { + "epoch": 2.1438774447554816, + "grad_norm": 14.575434684753418, + "learning_rate": 1.4268709254075304e-05, + "loss": 2.6623, + "step": 6896500 + }, + { + "epoch": 2.1440328770359685, + "grad_norm": 11.24375057220459, + "learning_rate": 1.4266118716067193e-05, + "loss": 2.687, + "step": 6897000 + }, + { + "epoch": 2.1441883093164553, + "grad_norm": 20.14803123474121, + "learning_rate": 1.4263528178059077e-05, + "loss": 2.6951, + "step": 6897500 + }, + { + "epoch": 2.144343741596942, + "grad_norm": 10.06086540222168, + "learning_rate": 1.4260937640050962e-05, + "loss": 2.6737, + "step": 6898000 + }, + { + "epoch": 2.144499173877429, + "grad_norm": 9.426129341125488, + "learning_rate": 1.4258347102042846e-05, + "loss": 2.691, + "step": 6898500 + }, + { + "epoch": 2.144654606157916, + "grad_norm": 10.86063003540039, + "learning_rate": 1.4255756564034733e-05, + "loss": 2.6972, + "step": 6899000 + }, + { + "epoch": 2.144810038438403, + "grad_norm": 12.535907745361328, + "learning_rate": 1.4253166026026618e-05, + "loss": 2.649, + "step": 6899500 + }, + { + "epoch": 2.1449654707188897, + "grad_norm": 9.795196533203125, + "learning_rate": 1.4250575488018502e-05, + "loss": 2.7043, + "step": 6900000 + }, + { + "epoch": 2.1451209029993765, + "grad_norm": 10.044134140014648, + "learning_rate": 1.4247984950010388e-05, + "loss": 2.6869, + "step": 6900500 + }, + { + "epoch": 2.1452763352798634, + "grad_norm": 9.287060737609863, + "learning_rate": 1.4245394412002275e-05, + "loss": 2.6693, + "step": 6901000 + }, + { + "epoch": 2.1454317675603507, + "grad_norm": 10.35102653503418, + "learning_rate": 1.4242803873994159e-05, + "loss": 2.6875, + "step": 6901500 + }, + { + "epoch": 2.145587199840837, + "grad_norm": 11.111834526062012, + "learning_rate": 1.4240213335986044e-05, + "loss": 2.6261, + "step": 6902000 + }, + { + "epoch": 2.1457426321213244, + "grad_norm": 9.719276428222656, + "learning_rate": 1.4237622797977931e-05, + "loss": 2.6221, + "step": 6902500 + }, + { + "epoch": 2.1458980644018113, + "grad_norm": 8.443910598754883, + "learning_rate": 1.4235032259969817e-05, + "loss": 2.6778, + "step": 6903000 + }, + { + "epoch": 2.146053496682298, + "grad_norm": 10.588905334472656, + "learning_rate": 1.42324417219617e-05, + "loss": 2.6302, + "step": 6903500 + }, + { + "epoch": 2.146208928962785, + "grad_norm": 7.721853256225586, + "learning_rate": 1.4229851183953586e-05, + "loss": 2.6493, + "step": 6904000 + }, + { + "epoch": 2.146364361243272, + "grad_norm": 11.523174285888672, + "learning_rate": 1.4227260645945473e-05, + "loss": 2.6878, + "step": 6904500 + }, + { + "epoch": 2.1465197935237588, + "grad_norm": 8.374767303466797, + "learning_rate": 1.4224670107937357e-05, + "loss": 2.6819, + "step": 6905000 + }, + { + "epoch": 2.1466752258042456, + "grad_norm": 11.39258098602295, + "learning_rate": 1.4222079569929242e-05, + "loss": 2.6519, + "step": 6905500 + }, + { + "epoch": 2.1468306580847325, + "grad_norm": 62.34596633911133, + "learning_rate": 1.421948903192113e-05, + "loss": 2.6929, + "step": 6906000 + }, + { + "epoch": 2.1469860903652194, + "grad_norm": 13.698530197143555, + "learning_rate": 1.4216898493913013e-05, + "loss": 2.6965, + "step": 6906500 + }, + { + "epoch": 2.1471415226457062, + "grad_norm": 10.973458290100098, + "learning_rate": 1.4214307955904899e-05, + "loss": 2.6776, + "step": 6907000 + }, + { + "epoch": 2.147296954926193, + "grad_norm": 10.023569107055664, + "learning_rate": 1.4211717417896784e-05, + "loss": 2.6954, + "step": 6907500 + }, + { + "epoch": 2.14745238720668, + "grad_norm": 19.089096069335938, + "learning_rate": 1.4209126879888671e-05, + "loss": 2.6374, + "step": 6908000 + }, + { + "epoch": 2.147607819487167, + "grad_norm": 12.936967849731445, + "learning_rate": 1.4206536341880555e-05, + "loss": 2.6681, + "step": 6908500 + }, + { + "epoch": 2.1477632517676537, + "grad_norm": 9.34429931640625, + "learning_rate": 1.420394580387244e-05, + "loss": 2.6841, + "step": 6909000 + }, + { + "epoch": 2.1479186840481406, + "grad_norm": 18.173444747924805, + "learning_rate": 1.4201355265864328e-05, + "loss": 2.641, + "step": 6909500 + }, + { + "epoch": 2.1480741163286274, + "grad_norm": 16.110794067382812, + "learning_rate": 1.4198764727856211e-05, + "loss": 2.6465, + "step": 6910000 + }, + { + "epoch": 2.1482295486091143, + "grad_norm": 9.687171936035156, + "learning_rate": 1.4196174189848097e-05, + "loss": 2.6531, + "step": 6910500 + }, + { + "epoch": 2.148384980889601, + "grad_norm": 9.173755645751953, + "learning_rate": 1.419358365183998e-05, + "loss": 2.6628, + "step": 6911000 + }, + { + "epoch": 2.148540413170088, + "grad_norm": 9.052120208740234, + "learning_rate": 1.419099311383187e-05, + "loss": 2.6915, + "step": 6911500 + }, + { + "epoch": 2.148695845450575, + "grad_norm": 9.992536544799805, + "learning_rate": 1.4188402575823753e-05, + "loss": 2.6844, + "step": 6912000 + }, + { + "epoch": 2.1488512777310618, + "grad_norm": 8.754326820373535, + "learning_rate": 1.4185812037815639e-05, + "loss": 2.6193, + "step": 6912500 + }, + { + "epoch": 2.1490067100115486, + "grad_norm": 7.189298629760742, + "learning_rate": 1.4183221499807522e-05, + "loss": 2.6019, + "step": 6913000 + }, + { + "epoch": 2.1491621422920355, + "grad_norm": 9.271876335144043, + "learning_rate": 1.418063096179941e-05, + "loss": 2.7197, + "step": 6913500 + }, + { + "epoch": 2.1493175745725224, + "grad_norm": 10.109270095825195, + "learning_rate": 1.4178040423791295e-05, + "loss": 2.6709, + "step": 6914000 + }, + { + "epoch": 2.1494730068530092, + "grad_norm": 9.342630386352539, + "learning_rate": 1.4175449885783179e-05, + "loss": 2.708, + "step": 6914500 + }, + { + "epoch": 2.149628439133496, + "grad_norm": 8.449195861816406, + "learning_rate": 1.4172859347775066e-05, + "loss": 2.7015, + "step": 6915000 + }, + { + "epoch": 2.149783871413983, + "grad_norm": 9.914562225341797, + "learning_rate": 1.4170268809766951e-05, + "loss": 2.6093, + "step": 6915500 + }, + { + "epoch": 2.14993930369447, + "grad_norm": 9.384052276611328, + "learning_rate": 1.4167678271758835e-05, + "loss": 2.6477, + "step": 6916000 + }, + { + "epoch": 2.1500947359749567, + "grad_norm": 14.816511154174805, + "learning_rate": 1.416508773375072e-05, + "loss": 2.6399, + "step": 6916500 + }, + { + "epoch": 2.1502501682554436, + "grad_norm": 10.141983032226562, + "learning_rate": 1.4162497195742608e-05, + "loss": 2.6783, + "step": 6917000 + }, + { + "epoch": 2.1504056005359304, + "grad_norm": 8.511673927307129, + "learning_rate": 1.4159906657734493e-05, + "loss": 2.6575, + "step": 6917500 + }, + { + "epoch": 2.1505610328164173, + "grad_norm": 10.909278869628906, + "learning_rate": 1.4157316119726377e-05, + "loss": 2.6795, + "step": 6918000 + }, + { + "epoch": 2.150716465096904, + "grad_norm": 9.295022010803223, + "learning_rate": 1.4154725581718264e-05, + "loss": 2.6364, + "step": 6918500 + }, + { + "epoch": 2.150871897377391, + "grad_norm": 8.808869361877441, + "learning_rate": 1.415213504371015e-05, + "loss": 2.6837, + "step": 6919000 + }, + { + "epoch": 2.151027329657878, + "grad_norm": 7.70533561706543, + "learning_rate": 1.4149544505702033e-05, + "loss": 2.7156, + "step": 6919500 + }, + { + "epoch": 2.1511827619383648, + "grad_norm": 10.375288009643555, + "learning_rate": 1.4146953967693919e-05, + "loss": 2.7103, + "step": 6920000 + }, + { + "epoch": 2.1513381942188516, + "grad_norm": 10.768021583557129, + "learning_rate": 1.4144363429685806e-05, + "loss": 2.6312, + "step": 6920500 + }, + { + "epoch": 2.1514936264993385, + "grad_norm": 9.785130500793457, + "learning_rate": 1.414177289167769e-05, + "loss": 2.6405, + "step": 6921000 + }, + { + "epoch": 2.1516490587798254, + "grad_norm": 15.448667526245117, + "learning_rate": 1.4139182353669575e-05, + "loss": 2.6984, + "step": 6921500 + }, + { + "epoch": 2.1518044910603122, + "grad_norm": 11.631485939025879, + "learning_rate": 1.4136591815661459e-05, + "loss": 2.6655, + "step": 6922000 + }, + { + "epoch": 2.151959923340799, + "grad_norm": 14.099298477172852, + "learning_rate": 1.4134001277653348e-05, + "loss": 2.6521, + "step": 6922500 + }, + { + "epoch": 2.152115355621286, + "grad_norm": 10.679364204406738, + "learning_rate": 1.4131410739645232e-05, + "loss": 2.6633, + "step": 6923000 + }, + { + "epoch": 2.152270787901773, + "grad_norm": 10.632946968078613, + "learning_rate": 1.4128820201637117e-05, + "loss": 2.7326, + "step": 6923500 + }, + { + "epoch": 2.1524262201822597, + "grad_norm": 10.764555931091309, + "learning_rate": 1.4126229663629004e-05, + "loss": 2.6322, + "step": 6924000 + }, + { + "epoch": 2.1525816524627466, + "grad_norm": 9.743834495544434, + "learning_rate": 1.4123639125620888e-05, + "loss": 2.6335, + "step": 6924500 + }, + { + "epoch": 2.1527370847432334, + "grad_norm": 8.142104148864746, + "learning_rate": 1.4121048587612773e-05, + "loss": 2.6526, + "step": 6925000 + }, + { + "epoch": 2.1528925170237203, + "grad_norm": 8.904022216796875, + "learning_rate": 1.4118458049604657e-05, + "loss": 2.6691, + "step": 6925500 + }, + { + "epoch": 2.1530479493042076, + "grad_norm": 8.553689956665039, + "learning_rate": 1.4115867511596544e-05, + "loss": 2.7278, + "step": 6926000 + }, + { + "epoch": 2.153203381584694, + "grad_norm": 8.799236297607422, + "learning_rate": 1.411327697358843e-05, + "loss": 2.6499, + "step": 6926500 + }, + { + "epoch": 2.1533588138651814, + "grad_norm": 13.40770149230957, + "learning_rate": 1.4110686435580314e-05, + "loss": 2.615, + "step": 6927000 + }, + { + "epoch": 2.153514246145668, + "grad_norm": 9.18278694152832, + "learning_rate": 1.4108095897572202e-05, + "loss": 2.6534, + "step": 6927500 + }, + { + "epoch": 2.153669678426155, + "grad_norm": 32.21303176879883, + "learning_rate": 1.4105505359564086e-05, + "loss": 2.6924, + "step": 6928000 + }, + { + "epoch": 2.153825110706642, + "grad_norm": 22.05892562866211, + "learning_rate": 1.4102914821555972e-05, + "loss": 2.6167, + "step": 6928500 + }, + { + "epoch": 2.153980542987129, + "grad_norm": 8.985677719116211, + "learning_rate": 1.4100324283547855e-05, + "loss": 2.6387, + "step": 6929000 + }, + { + "epoch": 2.1541359752676157, + "grad_norm": 8.494823455810547, + "learning_rate": 1.4097733745539743e-05, + "loss": 2.6487, + "step": 6929500 + }, + { + "epoch": 2.1542914075481026, + "grad_norm": 31.150304794311523, + "learning_rate": 1.4095143207531628e-05, + "loss": 2.6834, + "step": 6930000 + }, + { + "epoch": 2.1544468398285894, + "grad_norm": 8.919283866882324, + "learning_rate": 1.4092552669523512e-05, + "loss": 2.6792, + "step": 6930500 + }, + { + "epoch": 2.1546022721090763, + "grad_norm": 9.573673248291016, + "learning_rate": 1.4089962131515397e-05, + "loss": 2.6217, + "step": 6931000 + }, + { + "epoch": 2.154757704389563, + "grad_norm": 10.540251731872559, + "learning_rate": 1.4087371593507284e-05, + "loss": 2.6814, + "step": 6931500 + }, + { + "epoch": 2.15491313667005, + "grad_norm": 18.192447662353516, + "learning_rate": 1.4084781055499168e-05, + "loss": 2.6317, + "step": 6932000 + }, + { + "epoch": 2.155068568950537, + "grad_norm": 9.87566089630127, + "learning_rate": 1.4082190517491054e-05, + "loss": 2.6708, + "step": 6932500 + }, + { + "epoch": 2.1552240012310238, + "grad_norm": 13.087434768676758, + "learning_rate": 1.407959997948294e-05, + "loss": 2.6364, + "step": 6933000 + }, + { + "epoch": 2.1553794335115106, + "grad_norm": 6.345223903656006, + "learning_rate": 1.4077009441474826e-05, + "loss": 2.6608, + "step": 6933500 + }, + { + "epoch": 2.1555348657919975, + "grad_norm": 9.222509384155273, + "learning_rate": 1.407441890346671e-05, + "loss": 2.6548, + "step": 6934000 + }, + { + "epoch": 2.1556902980724844, + "grad_norm": 9.406241416931152, + "learning_rate": 1.4071828365458595e-05, + "loss": 2.6928, + "step": 6934500 + }, + { + "epoch": 2.1558457303529712, + "grad_norm": 6.719583034515381, + "learning_rate": 1.4069237827450483e-05, + "loss": 2.6085, + "step": 6935000 + }, + { + "epoch": 2.156001162633458, + "grad_norm": 9.71878719329834, + "learning_rate": 1.4066647289442366e-05, + "loss": 2.6951, + "step": 6935500 + }, + { + "epoch": 2.156156594913945, + "grad_norm": 12.36709976196289, + "learning_rate": 1.4064056751434252e-05, + "loss": 2.6163, + "step": 6936000 + }, + { + "epoch": 2.156312027194432, + "grad_norm": 10.9998140335083, + "learning_rate": 1.4061466213426139e-05, + "loss": 2.6281, + "step": 6936500 + }, + { + "epoch": 2.1564674594749187, + "grad_norm": 9.272518157958984, + "learning_rate": 1.4058875675418023e-05, + "loss": 2.7129, + "step": 6937000 + }, + { + "epoch": 2.1566228917554056, + "grad_norm": 26.016830444335938, + "learning_rate": 1.4056285137409908e-05, + "loss": 2.6724, + "step": 6937500 + }, + { + "epoch": 2.1567783240358924, + "grad_norm": 11.671568870544434, + "learning_rate": 1.4053694599401792e-05, + "loss": 2.6531, + "step": 6938000 + }, + { + "epoch": 2.1569337563163793, + "grad_norm": 16.546615600585938, + "learning_rate": 1.405110406139368e-05, + "loss": 2.6436, + "step": 6938500 + }, + { + "epoch": 2.157089188596866, + "grad_norm": 10.839771270751953, + "learning_rate": 1.4048513523385565e-05, + "loss": 2.6639, + "step": 6939000 + }, + { + "epoch": 2.157244620877353, + "grad_norm": 7.997097969055176, + "learning_rate": 1.404592298537745e-05, + "loss": 2.6837, + "step": 6939500 + }, + { + "epoch": 2.15740005315784, + "grad_norm": 9.805010795593262, + "learning_rate": 1.4043332447369334e-05, + "loss": 2.6704, + "step": 6940000 + }, + { + "epoch": 2.1575554854383268, + "grad_norm": 9.922659873962402, + "learning_rate": 1.4040741909361221e-05, + "loss": 2.685, + "step": 6940500 + }, + { + "epoch": 2.1577109177188136, + "grad_norm": 31.650474548339844, + "learning_rate": 1.4038151371353106e-05, + "loss": 2.6703, + "step": 6941000 + }, + { + "epoch": 2.1578663499993005, + "grad_norm": 9.26950740814209, + "learning_rate": 1.403556083334499e-05, + "loss": 2.673, + "step": 6941500 + }, + { + "epoch": 2.1580217822797874, + "grad_norm": 11.939724922180176, + "learning_rate": 1.4032970295336877e-05, + "loss": 2.673, + "step": 6942000 + }, + { + "epoch": 2.1581772145602742, + "grad_norm": 12.659272193908691, + "learning_rate": 1.4030379757328763e-05, + "loss": 2.6446, + "step": 6942500 + }, + { + "epoch": 2.158332646840761, + "grad_norm": 10.310530662536621, + "learning_rate": 1.4027789219320647e-05, + "loss": 2.6806, + "step": 6943000 + }, + { + "epoch": 2.158488079121248, + "grad_norm": 10.006417274475098, + "learning_rate": 1.4025198681312532e-05, + "loss": 2.6785, + "step": 6943500 + }, + { + "epoch": 2.158643511401735, + "grad_norm": 17.451499938964844, + "learning_rate": 1.402260814330442e-05, + "loss": 2.6485, + "step": 6944000 + }, + { + "epoch": 2.1587989436822217, + "grad_norm": 8.412123680114746, + "learning_rate": 1.4020017605296305e-05, + "loss": 2.6215, + "step": 6944500 + }, + { + "epoch": 2.1589543759627086, + "grad_norm": 12.395829200744629, + "learning_rate": 1.4017427067288188e-05, + "loss": 2.6524, + "step": 6945000 + }, + { + "epoch": 2.1591098082431954, + "grad_norm": 10.365640640258789, + "learning_rate": 1.4014836529280076e-05, + "loss": 2.6682, + "step": 6945500 + }, + { + "epoch": 2.1592652405236823, + "grad_norm": 10.141844749450684, + "learning_rate": 1.4012245991271961e-05, + "loss": 2.638, + "step": 6946000 + }, + { + "epoch": 2.159420672804169, + "grad_norm": 11.560178756713867, + "learning_rate": 1.4009655453263845e-05, + "loss": 2.6378, + "step": 6946500 + }, + { + "epoch": 2.159576105084656, + "grad_norm": 8.65772819519043, + "learning_rate": 1.400706491525573e-05, + "loss": 2.6509, + "step": 6947000 + }, + { + "epoch": 2.159731537365143, + "grad_norm": 9.865620613098145, + "learning_rate": 1.4004474377247617e-05, + "loss": 2.6503, + "step": 6947500 + }, + { + "epoch": 2.1598869696456298, + "grad_norm": 14.092305183410645, + "learning_rate": 1.4001883839239501e-05, + "loss": 2.6642, + "step": 6948000 + }, + { + "epoch": 2.1600424019261166, + "grad_norm": 9.670404434204102, + "learning_rate": 1.3999293301231387e-05, + "loss": 2.6468, + "step": 6948500 + }, + { + "epoch": 2.1601978342066035, + "grad_norm": 8.065197944641113, + "learning_rate": 1.399670276322327e-05, + "loss": 2.6637, + "step": 6949000 + }, + { + "epoch": 2.160353266487091, + "grad_norm": 8.49905776977539, + "learning_rate": 1.399411222521516e-05, + "loss": 2.6545, + "step": 6949500 + }, + { + "epoch": 2.1605086987675772, + "grad_norm": 10.280067443847656, + "learning_rate": 1.3991521687207043e-05, + "loss": 2.713, + "step": 6950000 + }, + { + "epoch": 2.1606641310480645, + "grad_norm": 8.464580535888672, + "learning_rate": 1.3988931149198928e-05, + "loss": 2.627, + "step": 6950500 + }, + { + "epoch": 2.1608195633285514, + "grad_norm": 9.496838569641113, + "learning_rate": 1.3986340611190816e-05, + "loss": 2.6795, + "step": 6951000 + }, + { + "epoch": 2.1609749956090383, + "grad_norm": 10.073761940002441, + "learning_rate": 1.39837500731827e-05, + "loss": 2.6819, + "step": 6951500 + }, + { + "epoch": 2.161130427889525, + "grad_norm": 8.32746696472168, + "learning_rate": 1.3981159535174585e-05, + "loss": 2.6599, + "step": 6952000 + }, + { + "epoch": 2.161285860170012, + "grad_norm": 9.211359024047852, + "learning_rate": 1.3978568997166469e-05, + "loss": 2.6913, + "step": 6952500 + }, + { + "epoch": 2.161441292450499, + "grad_norm": 10.079764366149902, + "learning_rate": 1.3975978459158356e-05, + "loss": 2.676, + "step": 6953000 + }, + { + "epoch": 2.1615967247309857, + "grad_norm": 7.122631072998047, + "learning_rate": 1.3973387921150241e-05, + "loss": 2.6415, + "step": 6953500 + }, + { + "epoch": 2.1617521570114726, + "grad_norm": 8.142557144165039, + "learning_rate": 1.3970797383142125e-05, + "loss": 2.6784, + "step": 6954000 + }, + { + "epoch": 2.1619075892919595, + "grad_norm": 10.319053649902344, + "learning_rate": 1.3968206845134014e-05, + "loss": 2.6614, + "step": 6954500 + }, + { + "epoch": 2.1620630215724463, + "grad_norm": 9.756387710571289, + "learning_rate": 1.3965616307125898e-05, + "loss": 2.6195, + "step": 6955000 + }, + { + "epoch": 2.162218453852933, + "grad_norm": 8.37960147857666, + "learning_rate": 1.3963025769117783e-05, + "loss": 2.7177, + "step": 6955500 + }, + { + "epoch": 2.16237388613342, + "grad_norm": 8.27995777130127, + "learning_rate": 1.3960435231109667e-05, + "loss": 2.7223, + "step": 6956000 + }, + { + "epoch": 2.162529318413907, + "grad_norm": 11.522819519042969, + "learning_rate": 1.3957844693101554e-05, + "loss": 2.6582, + "step": 6956500 + }, + { + "epoch": 2.162684750694394, + "grad_norm": 10.785483360290527, + "learning_rate": 1.395525415509344e-05, + "loss": 2.6806, + "step": 6957000 + }, + { + "epoch": 2.1628401829748807, + "grad_norm": 9.870943069458008, + "learning_rate": 1.3952663617085323e-05, + "loss": 2.6795, + "step": 6957500 + }, + { + "epoch": 2.1629956152553675, + "grad_norm": 8.918463706970215, + "learning_rate": 1.3950073079077209e-05, + "loss": 2.6528, + "step": 6958000 + }, + { + "epoch": 2.1631510475358544, + "grad_norm": 8.961310386657715, + "learning_rate": 1.3947482541069096e-05, + "loss": 2.6328, + "step": 6958500 + }, + { + "epoch": 2.1633064798163413, + "grad_norm": 8.362776756286621, + "learning_rate": 1.394489200306098e-05, + "loss": 2.6413, + "step": 6959000 + }, + { + "epoch": 2.163461912096828, + "grad_norm": 8.762429237365723, + "learning_rate": 1.3942301465052865e-05, + "loss": 2.6497, + "step": 6959500 + }, + { + "epoch": 2.163617344377315, + "grad_norm": 11.084994316101074, + "learning_rate": 1.3939710927044752e-05, + "loss": 2.6842, + "step": 6960000 + }, + { + "epoch": 2.163772776657802, + "grad_norm": 52.916603088378906, + "learning_rate": 1.3937120389036638e-05, + "loss": 2.659, + "step": 6960500 + }, + { + "epoch": 2.1639282089382887, + "grad_norm": 12.14886474609375, + "learning_rate": 1.3934529851028521e-05, + "loss": 2.6419, + "step": 6961000 + }, + { + "epoch": 2.1640836412187756, + "grad_norm": 10.830397605895996, + "learning_rate": 1.3931939313020407e-05, + "loss": 2.6342, + "step": 6961500 + }, + { + "epoch": 2.1642390734992625, + "grad_norm": 17.275705337524414, + "learning_rate": 1.3929348775012294e-05, + "loss": 2.6891, + "step": 6962000 + }, + { + "epoch": 2.1643945057797493, + "grad_norm": 10.97169303894043, + "learning_rate": 1.3926758237004178e-05, + "loss": 2.648, + "step": 6962500 + }, + { + "epoch": 2.164549938060236, + "grad_norm": 12.926855087280273, + "learning_rate": 1.3924167698996063e-05, + "loss": 2.7151, + "step": 6963000 + }, + { + "epoch": 2.164705370340723, + "grad_norm": 16.11216163635254, + "learning_rate": 1.392157716098795e-05, + "loss": 2.6122, + "step": 6963500 + }, + { + "epoch": 2.16486080262121, + "grad_norm": 9.032026290893555, + "learning_rate": 1.3918986622979834e-05, + "loss": 2.6273, + "step": 6964000 + }, + { + "epoch": 2.165016234901697, + "grad_norm": 9.34504508972168, + "learning_rate": 1.391639608497172e-05, + "loss": 2.6782, + "step": 6964500 + }, + { + "epoch": 2.1651716671821837, + "grad_norm": 11.412813186645508, + "learning_rate": 1.3913805546963603e-05, + "loss": 2.7144, + "step": 6965000 + }, + { + "epoch": 2.1653270994626705, + "grad_norm": 11.227768898010254, + "learning_rate": 1.3911215008955492e-05, + "loss": 2.6301, + "step": 6965500 + }, + { + "epoch": 2.1654825317431574, + "grad_norm": 9.617085456848145, + "learning_rate": 1.3908624470947376e-05, + "loss": 2.6646, + "step": 6966000 + }, + { + "epoch": 2.1656379640236443, + "grad_norm": 16.61867332458496, + "learning_rate": 1.3906033932939261e-05, + "loss": 2.6653, + "step": 6966500 + }, + { + "epoch": 2.165793396304131, + "grad_norm": 9.85401439666748, + "learning_rate": 1.3903443394931145e-05, + "loss": 2.6866, + "step": 6967000 + }, + { + "epoch": 2.165948828584618, + "grad_norm": 10.164339065551758, + "learning_rate": 1.3900852856923032e-05, + "loss": 2.6757, + "step": 6967500 + }, + { + "epoch": 2.166104260865105, + "grad_norm": 25.045669555664062, + "learning_rate": 1.3898262318914918e-05, + "loss": 2.6401, + "step": 6968000 + }, + { + "epoch": 2.1662596931455917, + "grad_norm": 9.91592788696289, + "learning_rate": 1.3895671780906802e-05, + "loss": 2.6478, + "step": 6968500 + }, + { + "epoch": 2.1664151254260786, + "grad_norm": 9.828829765319824, + "learning_rate": 1.3893081242898689e-05, + "loss": 2.6083, + "step": 6969000 + }, + { + "epoch": 2.1665705577065655, + "grad_norm": 10.306391716003418, + "learning_rate": 1.3890490704890574e-05, + "loss": 2.673, + "step": 6969500 + }, + { + "epoch": 2.1667259899870523, + "grad_norm": 10.29233455657959, + "learning_rate": 1.3887900166882458e-05, + "loss": 2.6497, + "step": 6970000 + }, + { + "epoch": 2.166881422267539, + "grad_norm": 8.638379096984863, + "learning_rate": 1.3885309628874343e-05, + "loss": 2.6637, + "step": 6970500 + }, + { + "epoch": 2.167036854548026, + "grad_norm": 11.891406059265137, + "learning_rate": 1.388271909086623e-05, + "loss": 2.6812, + "step": 6971000 + }, + { + "epoch": 2.167192286828513, + "grad_norm": 8.670710563659668, + "learning_rate": 1.3880128552858116e-05, + "loss": 2.6705, + "step": 6971500 + }, + { + "epoch": 2.167347719109, + "grad_norm": 9.323163032531738, + "learning_rate": 1.387753801485e-05, + "loss": 2.6727, + "step": 6972000 + }, + { + "epoch": 2.1675031513894867, + "grad_norm": 9.370068550109863, + "learning_rate": 1.3874947476841887e-05, + "loss": 2.7054, + "step": 6972500 + }, + { + "epoch": 2.1676585836699735, + "grad_norm": 8.992891311645508, + "learning_rate": 1.3872356938833772e-05, + "loss": 2.6858, + "step": 6973000 + }, + { + "epoch": 2.1678140159504604, + "grad_norm": 9.171462059020996, + "learning_rate": 1.3869766400825656e-05, + "loss": 2.6679, + "step": 6973500 + }, + { + "epoch": 2.1679694482309477, + "grad_norm": 10.207688331604004, + "learning_rate": 1.3867175862817542e-05, + "loss": 2.674, + "step": 6974000 + }, + { + "epoch": 2.168124880511434, + "grad_norm": 8.458316802978516, + "learning_rate": 1.3864585324809429e-05, + "loss": 2.7068, + "step": 6974500 + }, + { + "epoch": 2.1682803127919215, + "grad_norm": 12.001093864440918, + "learning_rate": 1.3861994786801313e-05, + "loss": 2.6716, + "step": 6975000 + }, + { + "epoch": 2.1684357450724083, + "grad_norm": 22.355304718017578, + "learning_rate": 1.3859404248793198e-05, + "loss": 2.6742, + "step": 6975500 + }, + { + "epoch": 2.168591177352895, + "grad_norm": 9.881884574890137, + "learning_rate": 1.3856813710785082e-05, + "loss": 2.6994, + "step": 6976000 + }, + { + "epoch": 2.168746609633382, + "grad_norm": 103.25052642822266, + "learning_rate": 1.385422317277697e-05, + "loss": 2.6233, + "step": 6976500 + }, + { + "epoch": 2.168902041913869, + "grad_norm": 12.85307788848877, + "learning_rate": 1.3851632634768854e-05, + "loss": 2.6564, + "step": 6977000 + }, + { + "epoch": 2.169057474194356, + "grad_norm": 10.088165283203125, + "learning_rate": 1.384904209676074e-05, + "loss": 2.6706, + "step": 6977500 + }, + { + "epoch": 2.1692129064748427, + "grad_norm": 11.107019424438477, + "learning_rate": 1.3846451558752627e-05, + "loss": 2.6709, + "step": 6978000 + }, + { + "epoch": 2.1693683387553295, + "grad_norm": 16.045204162597656, + "learning_rate": 1.384386102074451e-05, + "loss": 2.6611, + "step": 6978500 + }, + { + "epoch": 2.1695237710358164, + "grad_norm": 8.340913772583008, + "learning_rate": 1.3841270482736396e-05, + "loss": 2.6653, + "step": 6979000 + }, + { + "epoch": 2.1696792033163033, + "grad_norm": 5.829801559448242, + "learning_rate": 1.383867994472828e-05, + "loss": 2.6409, + "step": 6979500 + }, + { + "epoch": 2.16983463559679, + "grad_norm": 7.44961404800415, + "learning_rate": 1.3836089406720167e-05, + "loss": 2.6538, + "step": 6980000 + }, + { + "epoch": 2.169990067877277, + "grad_norm": 8.583611488342285, + "learning_rate": 1.3833498868712053e-05, + "loss": 2.6264, + "step": 6980500 + }, + { + "epoch": 2.170145500157764, + "grad_norm": 9.372830390930176, + "learning_rate": 1.3830908330703936e-05, + "loss": 2.6408, + "step": 6981000 + }, + { + "epoch": 2.1703009324382507, + "grad_norm": 27.87590217590332, + "learning_rate": 1.3828317792695825e-05, + "loss": 2.6305, + "step": 6981500 + }, + { + "epoch": 2.1704563647187376, + "grad_norm": 10.344744682312012, + "learning_rate": 1.3825727254687709e-05, + "loss": 2.6759, + "step": 6982000 + }, + { + "epoch": 2.1706117969992245, + "grad_norm": 10.62699031829834, + "learning_rate": 1.3823136716679594e-05, + "loss": 2.7193, + "step": 6982500 + }, + { + "epoch": 2.1707672292797113, + "grad_norm": 16.602432250976562, + "learning_rate": 1.3820546178671478e-05, + "loss": 2.6243, + "step": 6983000 + }, + { + "epoch": 2.170922661560198, + "grad_norm": 9.93406867980957, + "learning_rate": 1.3817955640663365e-05, + "loss": 2.6977, + "step": 6983500 + }, + { + "epoch": 2.171078093840685, + "grad_norm": 9.260684967041016, + "learning_rate": 1.3815365102655251e-05, + "loss": 2.6372, + "step": 6984000 + }, + { + "epoch": 2.171233526121172, + "grad_norm": 28.777498245239258, + "learning_rate": 1.3812774564647135e-05, + "loss": 2.6582, + "step": 6984500 + }, + { + "epoch": 2.171388958401659, + "grad_norm": 9.633471488952637, + "learning_rate": 1.381018402663902e-05, + "loss": 2.7015, + "step": 6985000 + }, + { + "epoch": 2.1715443906821457, + "grad_norm": 9.331233978271484, + "learning_rate": 1.3807593488630907e-05, + "loss": 2.6602, + "step": 6985500 + }, + { + "epoch": 2.1716998229626325, + "grad_norm": 20.762287139892578, + "learning_rate": 1.3805002950622791e-05, + "loss": 2.6893, + "step": 6986000 + }, + { + "epoch": 2.1718552552431194, + "grad_norm": 10.05212116241455, + "learning_rate": 1.3802412412614676e-05, + "loss": 2.6268, + "step": 6986500 + }, + { + "epoch": 2.1720106875236063, + "grad_norm": 12.114684104919434, + "learning_rate": 1.3799821874606564e-05, + "loss": 2.642, + "step": 6987000 + }, + { + "epoch": 2.172166119804093, + "grad_norm": 8.659842491149902, + "learning_rate": 1.3797231336598449e-05, + "loss": 2.6816, + "step": 6987500 + }, + { + "epoch": 2.17232155208458, + "grad_norm": 9.634808540344238, + "learning_rate": 1.3794640798590333e-05, + "loss": 2.6658, + "step": 6988000 + }, + { + "epoch": 2.172476984365067, + "grad_norm": 7.977669715881348, + "learning_rate": 1.3792050260582218e-05, + "loss": 2.6477, + "step": 6988500 + }, + { + "epoch": 2.1726324166455537, + "grad_norm": 12.801535606384277, + "learning_rate": 1.3789459722574105e-05, + "loss": 2.7015, + "step": 6989000 + }, + { + "epoch": 2.1727878489260406, + "grad_norm": 42.327880859375, + "learning_rate": 1.378686918456599e-05, + "loss": 2.6408, + "step": 6989500 + }, + { + "epoch": 2.1729432812065275, + "grad_norm": 16.02213478088379, + "learning_rate": 1.3784278646557875e-05, + "loss": 2.6828, + "step": 6990000 + }, + { + "epoch": 2.1730987134870143, + "grad_norm": 9.453802108764648, + "learning_rate": 1.3781688108549762e-05, + "loss": 2.6189, + "step": 6990500 + }, + { + "epoch": 2.173254145767501, + "grad_norm": 9.311704635620117, + "learning_rate": 1.3779097570541646e-05, + "loss": 2.6471, + "step": 6991000 + }, + { + "epoch": 2.173409578047988, + "grad_norm": 10.369915008544922, + "learning_rate": 1.3776507032533531e-05, + "loss": 2.6882, + "step": 6991500 + }, + { + "epoch": 2.173565010328475, + "grad_norm": 9.009638786315918, + "learning_rate": 1.3773916494525415e-05, + "loss": 2.6401, + "step": 6992000 + }, + { + "epoch": 2.173720442608962, + "grad_norm": 14.259276390075684, + "learning_rate": 1.3771325956517304e-05, + "loss": 2.6659, + "step": 6992500 + }, + { + "epoch": 2.1738758748894487, + "grad_norm": 11.144628524780273, + "learning_rate": 1.3768735418509187e-05, + "loss": 2.668, + "step": 6993000 + }, + { + "epoch": 2.1740313071699355, + "grad_norm": 11.718795776367188, + "learning_rate": 1.3766144880501073e-05, + "loss": 2.7037, + "step": 6993500 + }, + { + "epoch": 2.1741867394504224, + "grad_norm": 10.059833526611328, + "learning_rate": 1.3763554342492957e-05, + "loss": 2.7104, + "step": 6994000 + }, + { + "epoch": 2.1743421717309093, + "grad_norm": 9.840580940246582, + "learning_rate": 1.3760963804484844e-05, + "loss": 2.6394, + "step": 6994500 + }, + { + "epoch": 2.174497604011396, + "grad_norm": 11.380404472351074, + "learning_rate": 1.375837326647673e-05, + "loss": 2.653, + "step": 6995000 + }, + { + "epoch": 2.174653036291883, + "grad_norm": 10.061885833740234, + "learning_rate": 1.3755782728468613e-05, + "loss": 2.6832, + "step": 6995500 + }, + { + "epoch": 2.17480846857237, + "grad_norm": 11.792829513549805, + "learning_rate": 1.37531921904605e-05, + "loss": 2.6811, + "step": 6996000 + }, + { + "epoch": 2.1749639008528567, + "grad_norm": 9.450597763061523, + "learning_rate": 1.3750601652452386e-05, + "loss": 2.6559, + "step": 6996500 + }, + { + "epoch": 2.1751193331333436, + "grad_norm": 9.572271347045898, + "learning_rate": 1.374801111444427e-05, + "loss": 2.6259, + "step": 6997000 + }, + { + "epoch": 2.1752747654138305, + "grad_norm": 9.587386131286621, + "learning_rate": 1.3745420576436155e-05, + "loss": 2.7113, + "step": 6997500 + }, + { + "epoch": 2.1754301976943173, + "grad_norm": 7.406138896942139, + "learning_rate": 1.3742830038428042e-05, + "loss": 2.6592, + "step": 6998000 + }, + { + "epoch": 2.1755856299748046, + "grad_norm": 10.020125389099121, + "learning_rate": 1.3740239500419927e-05, + "loss": 2.6575, + "step": 6998500 + }, + { + "epoch": 2.175741062255291, + "grad_norm": 9.159137725830078, + "learning_rate": 1.3737648962411811e-05, + "loss": 2.6768, + "step": 6999000 + }, + { + "epoch": 2.1758964945357784, + "grad_norm": 8.58443832397461, + "learning_rate": 1.3735058424403698e-05, + "loss": 2.6807, + "step": 6999500 + }, + { + "epoch": 2.1760519268162652, + "grad_norm": 9.474591255187988, + "learning_rate": 1.3732467886395584e-05, + "loss": 2.6635, + "step": 7000000 + }, + { + "epoch": 2.176207359096752, + "grad_norm": 9.845407485961914, + "learning_rate": 1.3729877348387468e-05, + "loss": 2.661, + "step": 7000500 + }, + { + "epoch": 2.176362791377239, + "grad_norm": 15.079900741577148, + "learning_rate": 1.3727286810379353e-05, + "loss": 2.6669, + "step": 7001000 + }, + { + "epoch": 2.176518223657726, + "grad_norm": 11.916092872619629, + "learning_rate": 1.372469627237124e-05, + "loss": 2.6339, + "step": 7001500 + }, + { + "epoch": 2.1766736559382127, + "grad_norm": 11.26401424407959, + "learning_rate": 1.3722105734363126e-05, + "loss": 2.673, + "step": 7002000 + }, + { + "epoch": 2.1768290882186996, + "grad_norm": 10.04964828491211, + "learning_rate": 1.371951519635501e-05, + "loss": 2.6593, + "step": 7002500 + }, + { + "epoch": 2.1769845204991864, + "grad_norm": 9.35595989227295, + "learning_rate": 1.3716924658346895e-05, + "loss": 2.684, + "step": 7003000 + }, + { + "epoch": 2.1771399527796733, + "grad_norm": 11.470810890197754, + "learning_rate": 1.3714334120338782e-05, + "loss": 2.6304, + "step": 7003500 + }, + { + "epoch": 2.17729538506016, + "grad_norm": 8.918736457824707, + "learning_rate": 1.3711743582330666e-05, + "loss": 2.6837, + "step": 7004000 + }, + { + "epoch": 2.177450817340647, + "grad_norm": 8.253778457641602, + "learning_rate": 1.3709153044322551e-05, + "loss": 2.6087, + "step": 7004500 + }, + { + "epoch": 2.177606249621134, + "grad_norm": 10.377737045288086, + "learning_rate": 1.3706562506314438e-05, + "loss": 2.6771, + "step": 7005000 + }, + { + "epoch": 2.177761681901621, + "grad_norm": 14.320731163024902, + "learning_rate": 1.3703971968306322e-05, + "loss": 2.6545, + "step": 7005500 + }, + { + "epoch": 2.1779171141821076, + "grad_norm": 29.314109802246094, + "learning_rate": 1.3701381430298208e-05, + "loss": 2.6495, + "step": 7006000 + }, + { + "epoch": 2.1780725464625945, + "grad_norm": 8.779444694519043, + "learning_rate": 1.3698790892290091e-05, + "loss": 2.6691, + "step": 7006500 + }, + { + "epoch": 2.1782279787430814, + "grad_norm": 9.215763092041016, + "learning_rate": 1.369620035428198e-05, + "loss": 2.631, + "step": 7007000 + }, + { + "epoch": 2.1783834110235683, + "grad_norm": 9.784106254577637, + "learning_rate": 1.3693609816273864e-05, + "loss": 2.6573, + "step": 7007500 + }, + { + "epoch": 2.178538843304055, + "grad_norm": 9.973637580871582, + "learning_rate": 1.369101927826575e-05, + "loss": 2.6694, + "step": 7008000 + }, + { + "epoch": 2.178694275584542, + "grad_norm": 10.152318954467773, + "learning_rate": 1.3688428740257637e-05, + "loss": 2.6499, + "step": 7008500 + }, + { + "epoch": 2.178849707865029, + "grad_norm": 12.104337692260742, + "learning_rate": 1.368583820224952e-05, + "loss": 2.6667, + "step": 7009000 + }, + { + "epoch": 2.1790051401455157, + "grad_norm": 8.524500846862793, + "learning_rate": 1.3683247664241406e-05, + "loss": 2.621, + "step": 7009500 + }, + { + "epoch": 2.1791605724260026, + "grad_norm": 9.416132926940918, + "learning_rate": 1.368065712623329e-05, + "loss": 2.6474, + "step": 7010000 + }, + { + "epoch": 2.1793160047064895, + "grad_norm": 9.715571403503418, + "learning_rate": 1.3678066588225177e-05, + "loss": 2.6569, + "step": 7010500 + }, + { + "epoch": 2.1794714369869763, + "grad_norm": 14.368125915527344, + "learning_rate": 1.3675476050217062e-05, + "loss": 2.7021, + "step": 7011000 + }, + { + "epoch": 2.179626869267463, + "grad_norm": 9.967904090881348, + "learning_rate": 1.3672885512208946e-05, + "loss": 2.6723, + "step": 7011500 + }, + { + "epoch": 2.17978230154795, + "grad_norm": 8.422673225402832, + "learning_rate": 1.3670294974200831e-05, + "loss": 2.6454, + "step": 7012000 + }, + { + "epoch": 2.179937733828437, + "grad_norm": 8.473954200744629, + "learning_rate": 1.3667704436192719e-05, + "loss": 2.6714, + "step": 7012500 + }, + { + "epoch": 2.180093166108924, + "grad_norm": 31.759313583374023, + "learning_rate": 1.3665113898184604e-05, + "loss": 2.6485, + "step": 7013000 + }, + { + "epoch": 2.1802485983894107, + "grad_norm": 9.590154647827148, + "learning_rate": 1.3662523360176488e-05, + "loss": 2.6376, + "step": 7013500 + }, + { + "epoch": 2.1804040306698975, + "grad_norm": 9.388434410095215, + "learning_rate": 1.3659932822168375e-05, + "loss": 2.6196, + "step": 7014000 + }, + { + "epoch": 2.1805594629503844, + "grad_norm": 10.180109024047852, + "learning_rate": 1.365734228416026e-05, + "loss": 2.723, + "step": 7014500 + }, + { + "epoch": 2.1807148952308713, + "grad_norm": 19.871997833251953, + "learning_rate": 1.3654751746152144e-05, + "loss": 2.6109, + "step": 7015000 + }, + { + "epoch": 2.180870327511358, + "grad_norm": 10.461889266967773, + "learning_rate": 1.365216120814403e-05, + "loss": 2.6804, + "step": 7015500 + }, + { + "epoch": 2.181025759791845, + "grad_norm": 7.714508056640625, + "learning_rate": 1.3649570670135917e-05, + "loss": 2.6987, + "step": 7016000 + }, + { + "epoch": 2.181181192072332, + "grad_norm": 10.878181457519531, + "learning_rate": 1.36469801321278e-05, + "loss": 2.6877, + "step": 7016500 + }, + { + "epoch": 2.1813366243528187, + "grad_norm": 9.036035537719727, + "learning_rate": 1.3644389594119686e-05, + "loss": 2.7095, + "step": 7017000 + }, + { + "epoch": 2.1814920566333056, + "grad_norm": 9.340529441833496, + "learning_rate": 1.3641799056111573e-05, + "loss": 2.6563, + "step": 7017500 + }, + { + "epoch": 2.1816474889137925, + "grad_norm": 13.36214828491211, + "learning_rate": 1.3639208518103459e-05, + "loss": 2.6937, + "step": 7018000 + }, + { + "epoch": 2.1818029211942793, + "grad_norm": 50.73941421508789, + "learning_rate": 1.3636617980095342e-05, + "loss": 2.6911, + "step": 7018500 + }, + { + "epoch": 2.181958353474766, + "grad_norm": 9.218355178833008, + "learning_rate": 1.3634027442087228e-05, + "loss": 2.5956, + "step": 7019000 + }, + { + "epoch": 2.182113785755253, + "grad_norm": 10.641563415527344, + "learning_rate": 1.3631436904079115e-05, + "loss": 2.7118, + "step": 7019500 + }, + { + "epoch": 2.18226921803574, + "grad_norm": 8.882567405700684, + "learning_rate": 1.3628846366070999e-05, + "loss": 2.7047, + "step": 7020000 + }, + { + "epoch": 2.182424650316227, + "grad_norm": 11.34891128540039, + "learning_rate": 1.3626255828062884e-05, + "loss": 2.6474, + "step": 7020500 + }, + { + "epoch": 2.1825800825967137, + "grad_norm": 10.218010902404785, + "learning_rate": 1.3623665290054768e-05, + "loss": 2.6813, + "step": 7021000 + }, + { + "epoch": 2.1827355148772005, + "grad_norm": 7.751388072967529, + "learning_rate": 1.3621074752046655e-05, + "loss": 2.6829, + "step": 7021500 + }, + { + "epoch": 2.182890947157688, + "grad_norm": 10.916376113891602, + "learning_rate": 1.361848421403854e-05, + "loss": 2.6588, + "step": 7022000 + }, + { + "epoch": 2.1830463794381743, + "grad_norm": 10.56480598449707, + "learning_rate": 1.3615893676030424e-05, + "loss": 2.6935, + "step": 7022500 + }, + { + "epoch": 2.1832018117186616, + "grad_norm": 9.193339347839355, + "learning_rate": 1.3613303138022313e-05, + "loss": 2.7023, + "step": 7023000 + }, + { + "epoch": 2.1833572439991484, + "grad_norm": 8.485828399658203, + "learning_rate": 1.3610712600014197e-05, + "loss": 2.6712, + "step": 7023500 + }, + { + "epoch": 2.1835126762796353, + "grad_norm": 9.43916130065918, + "learning_rate": 1.3608122062006083e-05, + "loss": 2.6874, + "step": 7024000 + }, + { + "epoch": 2.183668108560122, + "grad_norm": 12.005130767822266, + "learning_rate": 1.3605531523997966e-05, + "loss": 2.663, + "step": 7024500 + }, + { + "epoch": 2.183823540840609, + "grad_norm": 8.468267440795898, + "learning_rate": 1.3602940985989853e-05, + "loss": 2.6416, + "step": 7025000 + }, + { + "epoch": 2.183978973121096, + "grad_norm": 16.097745895385742, + "learning_rate": 1.3600350447981739e-05, + "loss": 2.6701, + "step": 7025500 + }, + { + "epoch": 2.1841344054015828, + "grad_norm": 20.33794593811035, + "learning_rate": 1.3597759909973623e-05, + "loss": 2.6358, + "step": 7026000 + }, + { + "epoch": 2.1842898376820696, + "grad_norm": 14.92019271850586, + "learning_rate": 1.359516937196551e-05, + "loss": 2.6692, + "step": 7026500 + }, + { + "epoch": 2.1844452699625565, + "grad_norm": 8.611408233642578, + "learning_rate": 1.3592578833957395e-05, + "loss": 2.7004, + "step": 7027000 + }, + { + "epoch": 2.1846007022430434, + "grad_norm": 10.830490112304688, + "learning_rate": 1.3589988295949279e-05, + "loss": 2.6866, + "step": 7027500 + }, + { + "epoch": 2.1847561345235302, + "grad_norm": 12.053689002990723, + "learning_rate": 1.3587397757941165e-05, + "loss": 2.6693, + "step": 7028000 + }, + { + "epoch": 2.184911566804017, + "grad_norm": 9.239320755004883, + "learning_rate": 1.3584807219933052e-05, + "loss": 2.6646, + "step": 7028500 + }, + { + "epoch": 2.185066999084504, + "grad_norm": 10.133193969726562, + "learning_rate": 1.3582216681924937e-05, + "loss": 2.709, + "step": 7029000 + }, + { + "epoch": 2.185222431364991, + "grad_norm": 10.14928150177002, + "learning_rate": 1.3579626143916821e-05, + "loss": 2.6344, + "step": 7029500 + }, + { + "epoch": 2.1853778636454777, + "grad_norm": 9.459968566894531, + "learning_rate": 1.3577035605908706e-05, + "loss": 2.6167, + "step": 7030000 + }, + { + "epoch": 2.1855332959259646, + "grad_norm": 10.958450317382812, + "learning_rate": 1.3574445067900593e-05, + "loss": 2.7164, + "step": 7030500 + }, + { + "epoch": 2.1856887282064514, + "grad_norm": 32.44102478027344, + "learning_rate": 1.3571854529892477e-05, + "loss": 2.6329, + "step": 7031000 + }, + { + "epoch": 2.1858441604869383, + "grad_norm": 23.920551300048828, + "learning_rate": 1.3569263991884363e-05, + "loss": 2.649, + "step": 7031500 + }, + { + "epoch": 2.185999592767425, + "grad_norm": 9.104080200195312, + "learning_rate": 1.356667345387625e-05, + "loss": 2.6569, + "step": 7032000 + }, + { + "epoch": 2.186155025047912, + "grad_norm": 39.87963104248047, + "learning_rate": 1.3564082915868134e-05, + "loss": 2.6825, + "step": 7032500 + }, + { + "epoch": 2.186310457328399, + "grad_norm": 10.876375198364258, + "learning_rate": 1.3561492377860019e-05, + "loss": 2.6841, + "step": 7033000 + }, + { + "epoch": 2.1864658896088858, + "grad_norm": 7.452424049377441, + "learning_rate": 1.3558901839851903e-05, + "loss": 2.6176, + "step": 7033500 + }, + { + "epoch": 2.1866213218893726, + "grad_norm": 13.519349098205566, + "learning_rate": 1.3556311301843792e-05, + "loss": 2.6384, + "step": 7034000 + }, + { + "epoch": 2.1867767541698595, + "grad_norm": 10.021020889282227, + "learning_rate": 1.3553720763835675e-05, + "loss": 2.6869, + "step": 7034500 + }, + { + "epoch": 2.1869321864503464, + "grad_norm": 12.853666305541992, + "learning_rate": 1.3551130225827561e-05, + "loss": 2.656, + "step": 7035000 + }, + { + "epoch": 2.1870876187308332, + "grad_norm": 9.681904792785645, + "learning_rate": 1.3548539687819448e-05, + "loss": 2.6509, + "step": 7035500 + }, + { + "epoch": 2.18724305101132, + "grad_norm": 30.20355796813965, + "learning_rate": 1.3545949149811332e-05, + "loss": 2.6384, + "step": 7036000 + }, + { + "epoch": 2.187398483291807, + "grad_norm": 9.20132827758789, + "learning_rate": 1.3543358611803217e-05, + "loss": 2.6759, + "step": 7036500 + }, + { + "epoch": 2.187553915572294, + "grad_norm": 8.612076759338379, + "learning_rate": 1.3540768073795101e-05, + "loss": 2.6692, + "step": 7037000 + }, + { + "epoch": 2.1877093478527807, + "grad_norm": 13.89963436126709, + "learning_rate": 1.3538177535786988e-05, + "loss": 2.6389, + "step": 7037500 + }, + { + "epoch": 2.1878647801332676, + "grad_norm": 30.52775764465332, + "learning_rate": 1.3535586997778874e-05, + "loss": 2.66, + "step": 7038000 + }, + { + "epoch": 2.1880202124137544, + "grad_norm": 10.61279010772705, + "learning_rate": 1.3532996459770757e-05, + "loss": 2.6363, + "step": 7038500 + }, + { + "epoch": 2.1881756446942413, + "grad_norm": 15.900796890258789, + "learning_rate": 1.3530405921762646e-05, + "loss": 2.6748, + "step": 7039000 + }, + { + "epoch": 2.188331076974728, + "grad_norm": 15.614459037780762, + "learning_rate": 1.352781538375453e-05, + "loss": 2.6345, + "step": 7039500 + }, + { + "epoch": 2.188486509255215, + "grad_norm": 10.560728073120117, + "learning_rate": 1.3525224845746416e-05, + "loss": 2.6707, + "step": 7040000 + }, + { + "epoch": 2.188641941535702, + "grad_norm": 13.708731651306152, + "learning_rate": 1.35226343077383e-05, + "loss": 2.645, + "step": 7040500 + }, + { + "epoch": 2.1887973738161888, + "grad_norm": 15.682226181030273, + "learning_rate": 1.3520043769730186e-05, + "loss": 2.6322, + "step": 7041000 + }, + { + "epoch": 2.1889528060966756, + "grad_norm": 10.391497611999512, + "learning_rate": 1.3517453231722072e-05, + "loss": 2.6466, + "step": 7041500 + }, + { + "epoch": 2.1891082383771625, + "grad_norm": 10.986344337463379, + "learning_rate": 1.3514862693713956e-05, + "loss": 2.6263, + "step": 7042000 + }, + { + "epoch": 2.1892636706576494, + "grad_norm": 12.15768814086914, + "learning_rate": 1.3512272155705841e-05, + "loss": 2.6226, + "step": 7042500 + }, + { + "epoch": 2.1894191029381362, + "grad_norm": 12.12785530090332, + "learning_rate": 1.3509681617697728e-05, + "loss": 2.6993, + "step": 7043000 + }, + { + "epoch": 2.189574535218623, + "grad_norm": 9.928520202636719, + "learning_rate": 1.3507091079689612e-05, + "loss": 2.7226, + "step": 7043500 + }, + { + "epoch": 2.18972996749911, + "grad_norm": 9.918954849243164, + "learning_rate": 1.3504500541681498e-05, + "loss": 2.6711, + "step": 7044000 + }, + { + "epoch": 2.189885399779597, + "grad_norm": 11.794477462768555, + "learning_rate": 1.3501910003673385e-05, + "loss": 2.6543, + "step": 7044500 + }, + { + "epoch": 2.1900408320600837, + "grad_norm": 8.638436317443848, + "learning_rate": 1.349931946566527e-05, + "loss": 2.6818, + "step": 7045000 + }, + { + "epoch": 2.1901962643405706, + "grad_norm": 8.974214553833008, + "learning_rate": 1.3496728927657154e-05, + "loss": 2.7164, + "step": 7045500 + }, + { + "epoch": 2.1903516966210574, + "grad_norm": 14.348254203796387, + "learning_rate": 1.349413838964904e-05, + "loss": 2.6502, + "step": 7046000 + }, + { + "epoch": 2.1905071289015448, + "grad_norm": 7.991842746734619, + "learning_rate": 1.3491547851640926e-05, + "loss": 2.6297, + "step": 7046500 + }, + { + "epoch": 2.190662561182031, + "grad_norm": 15.513423919677734, + "learning_rate": 1.348895731363281e-05, + "loss": 2.6628, + "step": 7047000 + }, + { + "epoch": 2.1908179934625185, + "grad_norm": 9.414554595947266, + "learning_rate": 1.3486366775624696e-05, + "loss": 2.6256, + "step": 7047500 + }, + { + "epoch": 2.1909734257430054, + "grad_norm": 7.971940517425537, + "learning_rate": 1.3483776237616583e-05, + "loss": 2.6849, + "step": 7048000 + }, + { + "epoch": 2.191128858023492, + "grad_norm": 10.726962089538574, + "learning_rate": 1.3481185699608467e-05, + "loss": 2.6487, + "step": 7048500 + }, + { + "epoch": 2.191284290303979, + "grad_norm": 9.91797924041748, + "learning_rate": 1.3478595161600352e-05, + "loss": 2.6934, + "step": 7049000 + }, + { + "epoch": 2.191439722584466, + "grad_norm": 12.372905731201172, + "learning_rate": 1.3476004623592236e-05, + "loss": 2.7151, + "step": 7049500 + }, + { + "epoch": 2.191595154864953, + "grad_norm": 10.205401420593262, + "learning_rate": 1.3473414085584125e-05, + "loss": 2.6493, + "step": 7050000 + }, + { + "epoch": 2.1917505871454397, + "grad_norm": 9.921273231506348, + "learning_rate": 1.3470823547576008e-05, + "loss": 2.6363, + "step": 7050500 + }, + { + "epoch": 2.1919060194259266, + "grad_norm": 8.900282859802246, + "learning_rate": 1.3468233009567894e-05, + "loss": 2.698, + "step": 7051000 + }, + { + "epoch": 2.1920614517064134, + "grad_norm": 8.655560493469238, + "learning_rate": 1.3465642471559778e-05, + "loss": 2.6813, + "step": 7051500 + }, + { + "epoch": 2.1922168839869003, + "grad_norm": 12.135315895080566, + "learning_rate": 1.3463051933551665e-05, + "loss": 2.6821, + "step": 7052000 + }, + { + "epoch": 2.192372316267387, + "grad_norm": 22.818681716918945, + "learning_rate": 1.346046139554355e-05, + "loss": 2.6539, + "step": 7052500 + }, + { + "epoch": 2.192527748547874, + "grad_norm": 6.710369110107422, + "learning_rate": 1.3457870857535434e-05, + "loss": 2.6506, + "step": 7053000 + }, + { + "epoch": 2.192683180828361, + "grad_norm": 12.172780990600586, + "learning_rate": 1.3455280319527321e-05, + "loss": 2.6652, + "step": 7053500 + }, + { + "epoch": 2.1928386131088478, + "grad_norm": 8.803936958312988, + "learning_rate": 1.3452689781519207e-05, + "loss": 2.6826, + "step": 7054000 + }, + { + "epoch": 2.1929940453893346, + "grad_norm": 26.752979278564453, + "learning_rate": 1.345009924351109e-05, + "loss": 2.6466, + "step": 7054500 + }, + { + "epoch": 2.1931494776698215, + "grad_norm": 9.92175579071045, + "learning_rate": 1.3447508705502976e-05, + "loss": 2.6605, + "step": 7055000 + }, + { + "epoch": 2.1933049099503084, + "grad_norm": 10.79509449005127, + "learning_rate": 1.3444918167494863e-05, + "loss": 2.6609, + "step": 7055500 + }, + { + "epoch": 2.1934603422307952, + "grad_norm": 12.608068466186523, + "learning_rate": 1.3442327629486749e-05, + "loss": 2.6925, + "step": 7056000 + }, + { + "epoch": 2.193615774511282, + "grad_norm": 9.745149612426758, + "learning_rate": 1.3439737091478632e-05, + "loss": 2.6074, + "step": 7056500 + }, + { + "epoch": 2.193771206791769, + "grad_norm": 10.071015357971191, + "learning_rate": 1.343714655347052e-05, + "loss": 2.628, + "step": 7057000 + }, + { + "epoch": 2.193926639072256, + "grad_norm": 7.753749847412109, + "learning_rate": 1.3434556015462405e-05, + "loss": 2.6427, + "step": 7057500 + }, + { + "epoch": 2.1940820713527427, + "grad_norm": 11.176605224609375, + "learning_rate": 1.3431965477454289e-05, + "loss": 2.642, + "step": 7058000 + }, + { + "epoch": 2.1942375036332296, + "grad_norm": 8.374770164489746, + "learning_rate": 1.3429374939446174e-05, + "loss": 2.6457, + "step": 7058500 + }, + { + "epoch": 2.1943929359137164, + "grad_norm": 5.9833984375, + "learning_rate": 1.3426784401438061e-05, + "loss": 2.6459, + "step": 7059000 + }, + { + "epoch": 2.1945483681942033, + "grad_norm": 12.300938606262207, + "learning_rate": 1.3424193863429945e-05, + "loss": 2.6573, + "step": 7059500 + }, + { + "epoch": 2.19470380047469, + "grad_norm": 10.516050338745117, + "learning_rate": 1.342160332542183e-05, + "loss": 2.6434, + "step": 7060000 + }, + { + "epoch": 2.194859232755177, + "grad_norm": 10.50892448425293, + "learning_rate": 1.3419012787413714e-05, + "loss": 2.624, + "step": 7060500 + }, + { + "epoch": 2.195014665035664, + "grad_norm": 11.6234769821167, + "learning_rate": 1.3416422249405603e-05, + "loss": 2.6522, + "step": 7061000 + }, + { + "epoch": 2.1951700973161508, + "grad_norm": 18.816436767578125, + "learning_rate": 1.3413831711397487e-05, + "loss": 2.6463, + "step": 7061500 + }, + { + "epoch": 2.1953255295966376, + "grad_norm": 8.69320297241211, + "learning_rate": 1.3411241173389372e-05, + "loss": 2.6646, + "step": 7062000 + }, + { + "epoch": 2.1954809618771245, + "grad_norm": 12.6898775100708, + "learning_rate": 1.340865063538126e-05, + "loss": 2.6174, + "step": 7062500 + }, + { + "epoch": 2.1956363941576114, + "grad_norm": 12.133907318115234, + "learning_rate": 1.3406060097373143e-05, + "loss": 2.6762, + "step": 7063000 + }, + { + "epoch": 2.1957918264380982, + "grad_norm": 9.472986221313477, + "learning_rate": 1.3403469559365029e-05, + "loss": 2.696, + "step": 7063500 + }, + { + "epoch": 2.195947258718585, + "grad_norm": 11.275157928466797, + "learning_rate": 1.3400879021356912e-05, + "loss": 2.6309, + "step": 7064000 + }, + { + "epoch": 2.196102690999072, + "grad_norm": 8.9284029006958, + "learning_rate": 1.33982884833488e-05, + "loss": 2.6737, + "step": 7064500 + }, + { + "epoch": 2.196258123279559, + "grad_norm": 8.0679349899292, + "learning_rate": 1.3395697945340685e-05, + "loss": 2.7214, + "step": 7065000 + }, + { + "epoch": 2.1964135555600457, + "grad_norm": 9.481125831604004, + "learning_rate": 1.3393107407332569e-05, + "loss": 2.6358, + "step": 7065500 + }, + { + "epoch": 2.1965689878405326, + "grad_norm": 20.376585006713867, + "learning_rate": 1.3390516869324458e-05, + "loss": 2.6813, + "step": 7066000 + }, + { + "epoch": 2.1967244201210194, + "grad_norm": 8.453557968139648, + "learning_rate": 1.3387926331316341e-05, + "loss": 2.6248, + "step": 7066500 + }, + { + "epoch": 2.1968798524015063, + "grad_norm": 38.52239990234375, + "learning_rate": 1.3385335793308227e-05, + "loss": 2.6309, + "step": 7067000 + }, + { + "epoch": 2.197035284681993, + "grad_norm": 29.72000503540039, + "learning_rate": 1.338274525530011e-05, + "loss": 2.6534, + "step": 7067500 + }, + { + "epoch": 2.19719071696248, + "grad_norm": 12.98536491394043, + "learning_rate": 1.3380154717291998e-05, + "loss": 2.6621, + "step": 7068000 + }, + { + "epoch": 2.197346149242967, + "grad_norm": 15.908971786499023, + "learning_rate": 1.3377564179283883e-05, + "loss": 2.6612, + "step": 7068500 + }, + { + "epoch": 2.1975015815234538, + "grad_norm": 14.219606399536133, + "learning_rate": 1.3374973641275767e-05, + "loss": 2.6586, + "step": 7069000 + }, + { + "epoch": 2.1976570138039406, + "grad_norm": 16.307552337646484, + "learning_rate": 1.3372383103267653e-05, + "loss": 2.6131, + "step": 7069500 + }, + { + "epoch": 2.197812446084428, + "grad_norm": 13.723237991333008, + "learning_rate": 1.336979256525954e-05, + "loss": 2.6601, + "step": 7070000 + }, + { + "epoch": 2.1979678783649144, + "grad_norm": 12.298934936523438, + "learning_rate": 1.3367202027251423e-05, + "loss": 2.6673, + "step": 7070500 + }, + { + "epoch": 2.1981233106454017, + "grad_norm": 10.560261726379395, + "learning_rate": 1.3364611489243309e-05, + "loss": 2.6998, + "step": 7071000 + }, + { + "epoch": 2.1982787429258885, + "grad_norm": 8.21324634552002, + "learning_rate": 1.3362020951235196e-05, + "loss": 2.6801, + "step": 7071500 + }, + { + "epoch": 2.1984341752063754, + "grad_norm": 9.842781066894531, + "learning_rate": 1.3359430413227082e-05, + "loss": 2.6748, + "step": 7072000 + }, + { + "epoch": 2.1985896074868623, + "grad_norm": 9.256730079650879, + "learning_rate": 1.3356839875218965e-05, + "loss": 2.6959, + "step": 7072500 + }, + { + "epoch": 2.198745039767349, + "grad_norm": 11.149618148803711, + "learning_rate": 1.335424933721085e-05, + "loss": 2.6172, + "step": 7073000 + }, + { + "epoch": 2.198900472047836, + "grad_norm": 9.540712356567383, + "learning_rate": 1.3351658799202738e-05, + "loss": 2.6593, + "step": 7073500 + }, + { + "epoch": 2.199055904328323, + "grad_norm": 8.328335762023926, + "learning_rate": 1.3349068261194622e-05, + "loss": 2.6788, + "step": 7074000 + }, + { + "epoch": 2.1992113366088097, + "grad_norm": 13.652259826660156, + "learning_rate": 1.3346477723186507e-05, + "loss": 2.7029, + "step": 7074500 + }, + { + "epoch": 2.1993667688892966, + "grad_norm": 33.676856994628906, + "learning_rate": 1.3343887185178394e-05, + "loss": 2.6632, + "step": 7075000 + }, + { + "epoch": 2.1995222011697835, + "grad_norm": 8.90576457977295, + "learning_rate": 1.3341296647170278e-05, + "loss": 2.6103, + "step": 7075500 + }, + { + "epoch": 2.1996776334502703, + "grad_norm": 6.562003135681152, + "learning_rate": 1.3338706109162164e-05, + "loss": 2.6411, + "step": 7076000 + }, + { + "epoch": 2.199833065730757, + "grad_norm": 12.96711540222168, + "learning_rate": 1.3336115571154047e-05, + "loss": 2.6374, + "step": 7076500 + }, + { + "epoch": 2.199988498011244, + "grad_norm": 40.724456787109375, + "learning_rate": 1.3333525033145936e-05, + "loss": 2.7034, + "step": 7077000 + }, + { + "epoch": 2.200143930291731, + "grad_norm": 10.821879386901855, + "learning_rate": 1.333093449513782e-05, + "loss": 2.6559, + "step": 7077500 + }, + { + "epoch": 2.200299362572218, + "grad_norm": 11.61991024017334, + "learning_rate": 1.3328343957129705e-05, + "loss": 2.6604, + "step": 7078000 + }, + { + "epoch": 2.2004547948527047, + "grad_norm": 9.545774459838867, + "learning_rate": 1.3325753419121589e-05, + "loss": 2.6859, + "step": 7078500 + }, + { + "epoch": 2.2006102271331915, + "grad_norm": 9.139541625976562, + "learning_rate": 1.3323162881113476e-05, + "loss": 2.6459, + "step": 7079000 + }, + { + "epoch": 2.2007656594136784, + "grad_norm": 10.129974365234375, + "learning_rate": 1.3320572343105362e-05, + "loss": 2.6588, + "step": 7079500 + }, + { + "epoch": 2.2009210916941653, + "grad_norm": 8.717951774597168, + "learning_rate": 1.3317981805097245e-05, + "loss": 2.6504, + "step": 7080000 + }, + { + "epoch": 2.201076523974652, + "grad_norm": 8.287983894348145, + "learning_rate": 1.3315391267089133e-05, + "loss": 2.6915, + "step": 7080500 + }, + { + "epoch": 2.201231956255139, + "grad_norm": 8.163957595825195, + "learning_rate": 1.3312800729081018e-05, + "loss": 2.7018, + "step": 7081000 + }, + { + "epoch": 2.201387388535626, + "grad_norm": 9.097160339355469, + "learning_rate": 1.3310210191072902e-05, + "loss": 2.638, + "step": 7081500 + }, + { + "epoch": 2.2015428208161127, + "grad_norm": 14.175742149353027, + "learning_rate": 1.3307619653064787e-05, + "loss": 2.6601, + "step": 7082000 + }, + { + "epoch": 2.2016982530965996, + "grad_norm": 10.205787658691406, + "learning_rate": 1.3305029115056674e-05, + "loss": 2.6418, + "step": 7082500 + }, + { + "epoch": 2.2018536853770865, + "grad_norm": 10.250246047973633, + "learning_rate": 1.330243857704856e-05, + "loss": 2.6725, + "step": 7083000 + }, + { + "epoch": 2.2020091176575733, + "grad_norm": 9.245697021484375, + "learning_rate": 1.3299848039040444e-05, + "loss": 2.6409, + "step": 7083500 + }, + { + "epoch": 2.20216454993806, + "grad_norm": 28.209989547729492, + "learning_rate": 1.3297257501032331e-05, + "loss": 2.6303, + "step": 7084000 + }, + { + "epoch": 2.202319982218547, + "grad_norm": 9.709586143493652, + "learning_rate": 1.3294666963024216e-05, + "loss": 2.6625, + "step": 7084500 + }, + { + "epoch": 2.202475414499034, + "grad_norm": 9.015283584594727, + "learning_rate": 1.32920764250161e-05, + "loss": 2.6249, + "step": 7085000 + }, + { + "epoch": 2.202630846779521, + "grad_norm": 11.435638427734375, + "learning_rate": 1.3289485887007986e-05, + "loss": 2.6575, + "step": 7085500 + }, + { + "epoch": 2.2027862790600077, + "grad_norm": 9.791678428649902, + "learning_rate": 1.3286895348999873e-05, + "loss": 2.6724, + "step": 7086000 + }, + { + "epoch": 2.2029417113404945, + "grad_norm": 11.416150093078613, + "learning_rate": 1.3284304810991756e-05, + "loss": 2.6641, + "step": 7086500 + }, + { + "epoch": 2.2030971436209814, + "grad_norm": 11.11033821105957, + "learning_rate": 1.3281714272983642e-05, + "loss": 2.6809, + "step": 7087000 + }, + { + "epoch": 2.2032525759014683, + "grad_norm": 11.214434623718262, + "learning_rate": 1.3279123734975526e-05, + "loss": 2.6952, + "step": 7087500 + }, + { + "epoch": 2.203408008181955, + "grad_norm": 9.405591011047363, + "learning_rate": 1.3276533196967415e-05, + "loss": 2.6479, + "step": 7088000 + }, + { + "epoch": 2.203563440462442, + "grad_norm": 22.370445251464844, + "learning_rate": 1.3273942658959298e-05, + "loss": 2.6699, + "step": 7088500 + }, + { + "epoch": 2.203718872742929, + "grad_norm": 15.352683067321777, + "learning_rate": 1.3271352120951184e-05, + "loss": 2.6344, + "step": 7089000 + }, + { + "epoch": 2.2038743050234157, + "grad_norm": 10.265692710876465, + "learning_rate": 1.3268761582943071e-05, + "loss": 2.5964, + "step": 7089500 + }, + { + "epoch": 2.2040297373039026, + "grad_norm": 9.666911125183105, + "learning_rate": 1.3266171044934955e-05, + "loss": 2.6862, + "step": 7090000 + }, + { + "epoch": 2.2041851695843895, + "grad_norm": 8.8602933883667, + "learning_rate": 1.326358050692684e-05, + "loss": 2.6985, + "step": 7090500 + }, + { + "epoch": 2.2043406018648763, + "grad_norm": 10.075604438781738, + "learning_rate": 1.3260989968918724e-05, + "loss": 2.718, + "step": 7091000 + }, + { + "epoch": 2.204496034145363, + "grad_norm": 19.85933494567871, + "learning_rate": 1.3258399430910611e-05, + "loss": 2.6753, + "step": 7091500 + }, + { + "epoch": 2.20465146642585, + "grad_norm": 10.843159675598145, + "learning_rate": 1.3255808892902497e-05, + "loss": 2.6653, + "step": 7092000 + }, + { + "epoch": 2.204806898706337, + "grad_norm": 9.655806541442871, + "learning_rate": 1.325321835489438e-05, + "loss": 2.6739, + "step": 7092500 + }, + { + "epoch": 2.204962330986824, + "grad_norm": 34.20943832397461, + "learning_rate": 1.3250627816886269e-05, + "loss": 2.6609, + "step": 7093000 + }, + { + "epoch": 2.2051177632673107, + "grad_norm": 9.364381790161133, + "learning_rate": 1.3248037278878153e-05, + "loss": 2.6251, + "step": 7093500 + }, + { + "epoch": 2.2052731955477975, + "grad_norm": 18.14218521118164, + "learning_rate": 1.3245446740870038e-05, + "loss": 2.6359, + "step": 7094000 + }, + { + "epoch": 2.205428627828285, + "grad_norm": 15.10971736907959, + "learning_rate": 1.3242856202861922e-05, + "loss": 2.6699, + "step": 7094500 + }, + { + "epoch": 2.2055840601087713, + "grad_norm": 6.072082042694092, + "learning_rate": 1.324026566485381e-05, + "loss": 2.6474, + "step": 7095000 + }, + { + "epoch": 2.2057394923892586, + "grad_norm": 9.664881706237793, + "learning_rate": 1.3237675126845695e-05, + "loss": 2.647, + "step": 7095500 + }, + { + "epoch": 2.2058949246697455, + "grad_norm": 8.133036613464355, + "learning_rate": 1.3235084588837578e-05, + "loss": 2.6481, + "step": 7096000 + }, + { + "epoch": 2.2060503569502323, + "grad_norm": 36.668426513671875, + "learning_rate": 1.3232494050829464e-05, + "loss": 2.6478, + "step": 7096500 + }, + { + "epoch": 2.206205789230719, + "grad_norm": 8.996204376220703, + "learning_rate": 1.3229903512821351e-05, + "loss": 2.6516, + "step": 7097000 + }, + { + "epoch": 2.206361221511206, + "grad_norm": 10.864043235778809, + "learning_rate": 1.3227312974813237e-05, + "loss": 2.6374, + "step": 7097500 + }, + { + "epoch": 2.206516653791693, + "grad_norm": 8.256531715393066, + "learning_rate": 1.322472243680512e-05, + "loss": 2.6663, + "step": 7098000 + }, + { + "epoch": 2.20667208607218, + "grad_norm": 16.828075408935547, + "learning_rate": 1.3222131898797007e-05, + "loss": 2.681, + "step": 7098500 + }, + { + "epoch": 2.2068275183526667, + "grad_norm": 7.143847465515137, + "learning_rate": 1.3219541360788893e-05, + "loss": 2.6582, + "step": 7099000 + }, + { + "epoch": 2.2069829506331535, + "grad_norm": 9.364218711853027, + "learning_rate": 1.3216950822780777e-05, + "loss": 2.672, + "step": 7099500 + }, + { + "epoch": 2.2071383829136404, + "grad_norm": 9.647490501403809, + "learning_rate": 1.3214360284772662e-05, + "loss": 2.6532, + "step": 7100000 + }, + { + "epoch": 2.2072938151941273, + "grad_norm": 15.407587051391602, + "learning_rate": 1.321176974676455e-05, + "loss": 2.6188, + "step": 7100500 + }, + { + "epoch": 2.207449247474614, + "grad_norm": 25.738483428955078, + "learning_rate": 1.3209179208756433e-05, + "loss": 2.652, + "step": 7101000 + }, + { + "epoch": 2.207604679755101, + "grad_norm": 11.668415069580078, + "learning_rate": 1.3206588670748319e-05, + "loss": 2.6718, + "step": 7101500 + }, + { + "epoch": 2.207760112035588, + "grad_norm": 13.643402099609375, + "learning_rate": 1.3203998132740206e-05, + "loss": 2.6473, + "step": 7102000 + }, + { + "epoch": 2.2079155443160747, + "grad_norm": 8.84483814239502, + "learning_rate": 1.3201407594732091e-05, + "loss": 2.6298, + "step": 7102500 + }, + { + "epoch": 2.2080709765965616, + "grad_norm": 8.4900484085083, + "learning_rate": 1.3198817056723975e-05, + "loss": 2.6501, + "step": 7103000 + }, + { + "epoch": 2.2082264088770485, + "grad_norm": 8.539948463439941, + "learning_rate": 1.319622651871586e-05, + "loss": 2.6658, + "step": 7103500 + }, + { + "epoch": 2.2083818411575353, + "grad_norm": 34.620323181152344, + "learning_rate": 1.3193635980707748e-05, + "loss": 2.7065, + "step": 7104000 + }, + { + "epoch": 2.208537273438022, + "grad_norm": 9.107375144958496, + "learning_rate": 1.3191045442699631e-05, + "loss": 2.6967, + "step": 7104500 + }, + { + "epoch": 2.208692705718509, + "grad_norm": 13.108879089355469, + "learning_rate": 1.3188454904691517e-05, + "loss": 2.6796, + "step": 7105000 + }, + { + "epoch": 2.208848137998996, + "grad_norm": 14.021284103393555, + "learning_rate": 1.31858643666834e-05, + "loss": 2.6487, + "step": 7105500 + }, + { + "epoch": 2.209003570279483, + "grad_norm": 15.984251022338867, + "learning_rate": 1.3183273828675288e-05, + "loss": 2.6724, + "step": 7106000 + }, + { + "epoch": 2.2091590025599697, + "grad_norm": 11.001270294189453, + "learning_rate": 1.3180683290667173e-05, + "loss": 2.6615, + "step": 7106500 + }, + { + "epoch": 2.2093144348404565, + "grad_norm": 7.164456844329834, + "learning_rate": 1.3178092752659057e-05, + "loss": 2.6544, + "step": 7107000 + }, + { + "epoch": 2.2094698671209434, + "grad_norm": 9.85753345489502, + "learning_rate": 1.3175502214650946e-05, + "loss": 2.6931, + "step": 7107500 + }, + { + "epoch": 2.2096252994014303, + "grad_norm": 9.872026443481445, + "learning_rate": 1.317291167664283e-05, + "loss": 2.6334, + "step": 7108000 + }, + { + "epoch": 2.209780731681917, + "grad_norm": 12.99739933013916, + "learning_rate": 1.3170321138634715e-05, + "loss": 2.6669, + "step": 7108500 + }, + { + "epoch": 2.209936163962404, + "grad_norm": 8.976163864135742, + "learning_rate": 1.3167730600626599e-05, + "loss": 2.6549, + "step": 7109000 + }, + { + "epoch": 2.210091596242891, + "grad_norm": 9.080636024475098, + "learning_rate": 1.3165140062618486e-05, + "loss": 2.6856, + "step": 7109500 + }, + { + "epoch": 2.2102470285233777, + "grad_norm": 9.512466430664062, + "learning_rate": 1.3162549524610371e-05, + "loss": 2.6697, + "step": 7110000 + }, + { + "epoch": 2.2104024608038646, + "grad_norm": 10.8762845993042, + "learning_rate": 1.3159958986602255e-05, + "loss": 2.6588, + "step": 7110500 + }, + { + "epoch": 2.2105578930843515, + "grad_norm": 8.552327156066895, + "learning_rate": 1.3157368448594142e-05, + "loss": 2.6851, + "step": 7111000 + }, + { + "epoch": 2.2107133253648383, + "grad_norm": 9.673382759094238, + "learning_rate": 1.3154777910586028e-05, + "loss": 2.6609, + "step": 7111500 + }, + { + "epoch": 2.210868757645325, + "grad_norm": 10.953499794006348, + "learning_rate": 1.3152187372577912e-05, + "loss": 2.6659, + "step": 7112000 + }, + { + "epoch": 2.211024189925812, + "grad_norm": 12.391345024108887, + "learning_rate": 1.3149596834569797e-05, + "loss": 2.6859, + "step": 7112500 + }, + { + "epoch": 2.211179622206299, + "grad_norm": 9.602710723876953, + "learning_rate": 1.3147006296561684e-05, + "loss": 2.6749, + "step": 7113000 + }, + { + "epoch": 2.211335054486786, + "grad_norm": 10.36703872680664, + "learning_rate": 1.314441575855357e-05, + "loss": 2.6752, + "step": 7113500 + }, + { + "epoch": 2.2114904867672727, + "grad_norm": 9.62559700012207, + "learning_rate": 1.3141825220545453e-05, + "loss": 2.676, + "step": 7114000 + }, + { + "epoch": 2.2116459190477595, + "grad_norm": 8.67519760131836, + "learning_rate": 1.3139234682537339e-05, + "loss": 2.6905, + "step": 7114500 + }, + { + "epoch": 2.2118013513282464, + "grad_norm": 19.135648727416992, + "learning_rate": 1.3136644144529226e-05, + "loss": 2.6312, + "step": 7115000 + }, + { + "epoch": 2.2119567836087333, + "grad_norm": 10.126228332519531, + "learning_rate": 1.313405360652111e-05, + "loss": 2.7056, + "step": 7115500 + }, + { + "epoch": 2.21211221588922, + "grad_norm": 11.632248878479004, + "learning_rate": 1.3131463068512995e-05, + "loss": 2.6838, + "step": 7116000 + }, + { + "epoch": 2.212267648169707, + "grad_norm": 7.536538124084473, + "learning_rate": 1.3128872530504882e-05, + "loss": 2.6702, + "step": 7116500 + }, + { + "epoch": 2.212423080450194, + "grad_norm": 12.051187515258789, + "learning_rate": 1.3126281992496766e-05, + "loss": 2.6114, + "step": 7117000 + }, + { + "epoch": 2.2125785127306807, + "grad_norm": 22.48979949951172, + "learning_rate": 1.3123691454488652e-05, + "loss": 2.6843, + "step": 7117500 + }, + { + "epoch": 2.2127339450111676, + "grad_norm": 9.090034484863281, + "learning_rate": 1.3121100916480535e-05, + "loss": 2.645, + "step": 7118000 + }, + { + "epoch": 2.2128893772916545, + "grad_norm": 9.895238876342773, + "learning_rate": 1.3118510378472424e-05, + "loss": 2.7041, + "step": 7118500 + }, + { + "epoch": 2.213044809572142, + "grad_norm": 9.935173988342285, + "learning_rate": 1.3115919840464308e-05, + "loss": 2.6284, + "step": 7119000 + }, + { + "epoch": 2.213200241852628, + "grad_norm": 9.91609001159668, + "learning_rate": 1.3113329302456193e-05, + "loss": 2.7333, + "step": 7119500 + }, + { + "epoch": 2.2133556741331155, + "grad_norm": 9.660284996032715, + "learning_rate": 1.311073876444808e-05, + "loss": 2.644, + "step": 7120000 + }, + { + "epoch": 2.2135111064136024, + "grad_norm": 8.665094375610352, + "learning_rate": 1.3108148226439964e-05, + "loss": 2.6523, + "step": 7120500 + }, + { + "epoch": 2.2136665386940892, + "grad_norm": 9.326739311218262, + "learning_rate": 1.310555768843185e-05, + "loss": 2.6792, + "step": 7121000 + }, + { + "epoch": 2.213821970974576, + "grad_norm": 8.693669319152832, + "learning_rate": 1.3102967150423734e-05, + "loss": 2.6573, + "step": 7121500 + }, + { + "epoch": 2.213977403255063, + "grad_norm": 11.578669548034668, + "learning_rate": 1.310037661241562e-05, + "loss": 2.6494, + "step": 7122000 + }, + { + "epoch": 2.21413283553555, + "grad_norm": 9.38311767578125, + "learning_rate": 1.3097786074407506e-05, + "loss": 2.5886, + "step": 7122500 + }, + { + "epoch": 2.2142882678160367, + "grad_norm": 12.86021900177002, + "learning_rate": 1.309519553639939e-05, + "loss": 2.6698, + "step": 7123000 + }, + { + "epoch": 2.2144437000965236, + "grad_norm": 17.046445846557617, + "learning_rate": 1.3092604998391275e-05, + "loss": 2.6864, + "step": 7123500 + }, + { + "epoch": 2.2145991323770104, + "grad_norm": 9.570554733276367, + "learning_rate": 1.3090014460383163e-05, + "loss": 2.667, + "step": 7124000 + }, + { + "epoch": 2.2147545646574973, + "grad_norm": 10.712854385375977, + "learning_rate": 1.3087423922375048e-05, + "loss": 2.6585, + "step": 7124500 + }, + { + "epoch": 2.214909996937984, + "grad_norm": 10.663108825683594, + "learning_rate": 1.3084833384366932e-05, + "loss": 2.6357, + "step": 7125000 + }, + { + "epoch": 2.215065429218471, + "grad_norm": 12.18054485321045, + "learning_rate": 1.3082242846358819e-05, + "loss": 2.6999, + "step": 7125500 + }, + { + "epoch": 2.215220861498958, + "grad_norm": 9.276810646057129, + "learning_rate": 1.3079652308350704e-05, + "loss": 2.6192, + "step": 7126000 + }, + { + "epoch": 2.215376293779445, + "grad_norm": 10.806123733520508, + "learning_rate": 1.3077061770342588e-05, + "loss": 2.6392, + "step": 7126500 + }, + { + "epoch": 2.2155317260599316, + "grad_norm": 8.148138999938965, + "learning_rate": 1.3074471232334474e-05, + "loss": 2.652, + "step": 7127000 + }, + { + "epoch": 2.2156871583404185, + "grad_norm": 9.309374809265137, + "learning_rate": 1.307188069432636e-05, + "loss": 2.7535, + "step": 7127500 + }, + { + "epoch": 2.2158425906209054, + "grad_norm": 9.979364395141602, + "learning_rate": 1.3069290156318245e-05, + "loss": 2.6204, + "step": 7128000 + }, + { + "epoch": 2.2159980229013923, + "grad_norm": 12.960386276245117, + "learning_rate": 1.306669961831013e-05, + "loss": 2.6688, + "step": 7128500 + }, + { + "epoch": 2.216153455181879, + "grad_norm": 9.291023254394531, + "learning_rate": 1.3064109080302017e-05, + "loss": 2.6528, + "step": 7129000 + }, + { + "epoch": 2.216308887462366, + "grad_norm": 12.28662109375, + "learning_rate": 1.3061518542293903e-05, + "loss": 2.6297, + "step": 7129500 + }, + { + "epoch": 2.216464319742853, + "grad_norm": 8.140694618225098, + "learning_rate": 1.3058928004285786e-05, + "loss": 2.6548, + "step": 7130000 + }, + { + "epoch": 2.2166197520233397, + "grad_norm": 17.726356506347656, + "learning_rate": 1.3056337466277672e-05, + "loss": 2.6736, + "step": 7130500 + }, + { + "epoch": 2.2167751843038266, + "grad_norm": 12.982783317565918, + "learning_rate": 1.3053746928269559e-05, + "loss": 2.636, + "step": 7131000 + }, + { + "epoch": 2.2169306165843135, + "grad_norm": 8.953930854797363, + "learning_rate": 1.3051156390261443e-05, + "loss": 2.6518, + "step": 7131500 + }, + { + "epoch": 2.2170860488648003, + "grad_norm": 8.488070487976074, + "learning_rate": 1.3048565852253328e-05, + "loss": 2.6618, + "step": 7132000 + }, + { + "epoch": 2.217241481145287, + "grad_norm": 7.835366725921631, + "learning_rate": 1.3045975314245212e-05, + "loss": 2.6345, + "step": 7132500 + }, + { + "epoch": 2.217396913425774, + "grad_norm": 10.17237663269043, + "learning_rate": 1.3043384776237099e-05, + "loss": 2.6598, + "step": 7133000 + }, + { + "epoch": 2.217552345706261, + "grad_norm": 7.021352291107178, + "learning_rate": 1.3040794238228985e-05, + "loss": 2.6823, + "step": 7133500 + }, + { + "epoch": 2.217707777986748, + "grad_norm": 10.530366897583008, + "learning_rate": 1.3038203700220868e-05, + "loss": 2.6217, + "step": 7134000 + }, + { + "epoch": 2.2178632102672347, + "grad_norm": 7.812190532684326, + "learning_rate": 1.3035613162212757e-05, + "loss": 2.6667, + "step": 7134500 + }, + { + "epoch": 2.2180186425477215, + "grad_norm": 9.01450252532959, + "learning_rate": 1.3033022624204641e-05, + "loss": 2.6555, + "step": 7135000 + }, + { + "epoch": 2.2181740748282084, + "grad_norm": 6.617610454559326, + "learning_rate": 1.3030432086196526e-05, + "loss": 2.6567, + "step": 7135500 + }, + { + "epoch": 2.2183295071086953, + "grad_norm": 11.706563949584961, + "learning_rate": 1.302784154818841e-05, + "loss": 2.6689, + "step": 7136000 + }, + { + "epoch": 2.218484939389182, + "grad_norm": 20.004196166992188, + "learning_rate": 1.3025251010180297e-05, + "loss": 2.6648, + "step": 7136500 + }, + { + "epoch": 2.218640371669669, + "grad_norm": 11.224699974060059, + "learning_rate": 1.3022660472172183e-05, + "loss": 2.6474, + "step": 7137000 + }, + { + "epoch": 2.218795803950156, + "grad_norm": 9.189107894897461, + "learning_rate": 1.3020069934164067e-05, + "loss": 2.6435, + "step": 7137500 + }, + { + "epoch": 2.2189512362306427, + "grad_norm": 10.433833122253418, + "learning_rate": 1.3017479396155954e-05, + "loss": 2.6412, + "step": 7138000 + }, + { + "epoch": 2.2191066685111296, + "grad_norm": 7.869171142578125, + "learning_rate": 1.3014888858147839e-05, + "loss": 2.6919, + "step": 7138500 + }, + { + "epoch": 2.2192621007916165, + "grad_norm": 9.847500801086426, + "learning_rate": 1.3012298320139723e-05, + "loss": 2.6603, + "step": 7139000 + }, + { + "epoch": 2.2194175330721033, + "grad_norm": 11.58944320678711, + "learning_rate": 1.3009707782131608e-05, + "loss": 2.6372, + "step": 7139500 + }, + { + "epoch": 2.21957296535259, + "grad_norm": 10.87545108795166, + "learning_rate": 1.3007117244123496e-05, + "loss": 2.666, + "step": 7140000 + }, + { + "epoch": 2.219728397633077, + "grad_norm": 9.261749267578125, + "learning_rate": 1.3004526706115381e-05, + "loss": 2.6944, + "step": 7140500 + }, + { + "epoch": 2.219883829913564, + "grad_norm": 13.564358711242676, + "learning_rate": 1.3001936168107265e-05, + "loss": 2.6522, + "step": 7141000 + }, + { + "epoch": 2.220039262194051, + "grad_norm": 10.729466438293457, + "learning_rate": 1.299934563009915e-05, + "loss": 2.6276, + "step": 7141500 + }, + { + "epoch": 2.2201946944745377, + "grad_norm": 9.288251876831055, + "learning_rate": 1.2996755092091037e-05, + "loss": 2.7065, + "step": 7142000 + }, + { + "epoch": 2.220350126755025, + "grad_norm": 10.90540599822998, + "learning_rate": 1.2994164554082921e-05, + "loss": 2.6726, + "step": 7142500 + }, + { + "epoch": 2.2205055590355114, + "grad_norm": 10.630303382873535, + "learning_rate": 1.2991574016074807e-05, + "loss": 2.6122, + "step": 7143000 + }, + { + "epoch": 2.2206609913159987, + "grad_norm": 12.229846954345703, + "learning_rate": 1.2988983478066694e-05, + "loss": 2.6215, + "step": 7143500 + }, + { + "epoch": 2.2208164235964856, + "grad_norm": 9.266778945922852, + "learning_rate": 1.2986392940058578e-05, + "loss": 2.6355, + "step": 7144000 + }, + { + "epoch": 2.2209718558769724, + "grad_norm": 8.836146354675293, + "learning_rate": 1.2983802402050463e-05, + "loss": 2.6408, + "step": 7144500 + }, + { + "epoch": 2.2211272881574593, + "grad_norm": 9.7739896774292, + "learning_rate": 1.2981211864042347e-05, + "loss": 2.6293, + "step": 7145000 + }, + { + "epoch": 2.221282720437946, + "grad_norm": 10.288161277770996, + "learning_rate": 1.2978621326034236e-05, + "loss": 2.6507, + "step": 7145500 + }, + { + "epoch": 2.221438152718433, + "grad_norm": 11.92115592956543, + "learning_rate": 1.297603078802612e-05, + "loss": 2.6724, + "step": 7146000 + }, + { + "epoch": 2.22159358499892, + "grad_norm": 11.52691650390625, + "learning_rate": 1.2973440250018005e-05, + "loss": 2.6683, + "step": 7146500 + }, + { + "epoch": 2.2217490172794068, + "grad_norm": 10.695256233215332, + "learning_rate": 1.2970849712009892e-05, + "loss": 2.6424, + "step": 7147000 + }, + { + "epoch": 2.2219044495598936, + "grad_norm": 15.324193954467773, + "learning_rate": 1.2968259174001776e-05, + "loss": 2.6921, + "step": 7147500 + }, + { + "epoch": 2.2220598818403805, + "grad_norm": 8.937113761901855, + "learning_rate": 1.2965668635993661e-05, + "loss": 2.5997, + "step": 7148000 + }, + { + "epoch": 2.2222153141208674, + "grad_norm": 15.455540657043457, + "learning_rate": 1.2963078097985545e-05, + "loss": 2.6573, + "step": 7148500 + }, + { + "epoch": 2.2223707464013542, + "grad_norm": 8.310166358947754, + "learning_rate": 1.2960487559977432e-05, + "loss": 2.646, + "step": 7149000 + }, + { + "epoch": 2.222526178681841, + "grad_norm": 10.258296012878418, + "learning_rate": 1.2957897021969318e-05, + "loss": 2.656, + "step": 7149500 + }, + { + "epoch": 2.222681610962328, + "grad_norm": 9.709478378295898, + "learning_rate": 1.2955306483961201e-05, + "loss": 2.7266, + "step": 7150000 + }, + { + "epoch": 2.222837043242815, + "grad_norm": 10.528483390808105, + "learning_rate": 1.2952715945953087e-05, + "loss": 2.6753, + "step": 7150500 + }, + { + "epoch": 2.2229924755233017, + "grad_norm": 10.455787658691406, + "learning_rate": 1.2950125407944974e-05, + "loss": 2.655, + "step": 7151000 + }, + { + "epoch": 2.2231479078037886, + "grad_norm": 9.097046852111816, + "learning_rate": 1.294753486993686e-05, + "loss": 2.6699, + "step": 7151500 + }, + { + "epoch": 2.2233033400842754, + "grad_norm": 7.679221153259277, + "learning_rate": 1.2944944331928743e-05, + "loss": 2.6232, + "step": 7152000 + }, + { + "epoch": 2.2234587723647623, + "grad_norm": 12.452493667602539, + "learning_rate": 1.294235379392063e-05, + "loss": 2.6672, + "step": 7152500 + }, + { + "epoch": 2.223614204645249, + "grad_norm": 10.434927940368652, + "learning_rate": 1.2939763255912516e-05, + "loss": 2.6237, + "step": 7153000 + }, + { + "epoch": 2.223769636925736, + "grad_norm": 10.352173805236816, + "learning_rate": 1.29371727179044e-05, + "loss": 2.6851, + "step": 7153500 + }, + { + "epoch": 2.223925069206223, + "grad_norm": 11.149879455566406, + "learning_rate": 1.2934582179896285e-05, + "loss": 2.6787, + "step": 7154000 + }, + { + "epoch": 2.2240805014867098, + "grad_norm": 9.924142837524414, + "learning_rate": 1.2931991641888172e-05, + "loss": 2.6448, + "step": 7154500 + }, + { + "epoch": 2.2242359337671966, + "grad_norm": 10.163922309875488, + "learning_rate": 1.2929401103880056e-05, + "loss": 2.6316, + "step": 7155000 + }, + { + "epoch": 2.2243913660476835, + "grad_norm": 10.644573211669922, + "learning_rate": 1.2926810565871941e-05, + "loss": 2.667, + "step": 7155500 + }, + { + "epoch": 2.2245467983281704, + "grad_norm": 9.05226993560791, + "learning_rate": 1.2924220027863829e-05, + "loss": 2.634, + "step": 7156000 + }, + { + "epoch": 2.2247022306086572, + "grad_norm": 9.464699745178223, + "learning_rate": 1.2921629489855714e-05, + "loss": 2.6495, + "step": 7156500 + }, + { + "epoch": 2.224857662889144, + "grad_norm": 46.235286712646484, + "learning_rate": 1.2919038951847598e-05, + "loss": 2.6617, + "step": 7157000 + }, + { + "epoch": 2.225013095169631, + "grad_norm": 11.55652141571045, + "learning_rate": 1.2916448413839483e-05, + "loss": 2.6079, + "step": 7157500 + }, + { + "epoch": 2.225168527450118, + "grad_norm": 8.47425365447998, + "learning_rate": 1.291385787583137e-05, + "loss": 2.66, + "step": 7158000 + }, + { + "epoch": 2.2253239597306047, + "grad_norm": 19.65086555480957, + "learning_rate": 1.2911267337823254e-05, + "loss": 2.6924, + "step": 7158500 + }, + { + "epoch": 2.2254793920110916, + "grad_norm": 9.128742218017578, + "learning_rate": 1.290867679981514e-05, + "loss": 2.702, + "step": 7159000 + }, + { + "epoch": 2.2256348242915784, + "grad_norm": 9.44648265838623, + "learning_rate": 1.2906086261807023e-05, + "loss": 2.6818, + "step": 7159500 + }, + { + "epoch": 2.2257902565720653, + "grad_norm": 8.344047546386719, + "learning_rate": 1.290349572379891e-05, + "loss": 2.6545, + "step": 7160000 + }, + { + "epoch": 2.225945688852552, + "grad_norm": 9.283963203430176, + "learning_rate": 1.2900905185790796e-05, + "loss": 2.6923, + "step": 7160500 + }, + { + "epoch": 2.226101121133039, + "grad_norm": 14.37148380279541, + "learning_rate": 1.289831464778268e-05, + "loss": 2.6765, + "step": 7161000 + }, + { + "epoch": 2.226256553413526, + "grad_norm": 7.734679698944092, + "learning_rate": 1.2895724109774569e-05, + "loss": 2.6171, + "step": 7161500 + }, + { + "epoch": 2.2264119856940128, + "grad_norm": 12.772677421569824, + "learning_rate": 1.2893133571766452e-05, + "loss": 2.6553, + "step": 7162000 + }, + { + "epoch": 2.2265674179744996, + "grad_norm": 11.300198554992676, + "learning_rate": 1.2890543033758338e-05, + "loss": 2.6484, + "step": 7162500 + }, + { + "epoch": 2.2267228502549865, + "grad_norm": 9.202285766601562, + "learning_rate": 1.2887952495750222e-05, + "loss": 2.6639, + "step": 7163000 + }, + { + "epoch": 2.2268782825354734, + "grad_norm": 11.302162170410156, + "learning_rate": 1.2885361957742109e-05, + "loss": 2.6529, + "step": 7163500 + }, + { + "epoch": 2.2270337148159602, + "grad_norm": 10.159517288208008, + "learning_rate": 1.2882771419733994e-05, + "loss": 2.6437, + "step": 7164000 + }, + { + "epoch": 2.227189147096447, + "grad_norm": 9.448019027709961, + "learning_rate": 1.2880180881725878e-05, + "loss": 2.629, + "step": 7164500 + }, + { + "epoch": 2.227344579376934, + "grad_norm": 12.667049407958984, + "learning_rate": 1.2877590343717765e-05, + "loss": 2.6452, + "step": 7165000 + }, + { + "epoch": 2.227500011657421, + "grad_norm": 9.971600532531738, + "learning_rate": 1.287499980570965e-05, + "loss": 2.6532, + "step": 7165500 + }, + { + "epoch": 2.2276554439379077, + "grad_norm": 9.40889835357666, + "learning_rate": 1.2872409267701534e-05, + "loss": 2.6554, + "step": 7166000 + }, + { + "epoch": 2.2278108762183946, + "grad_norm": 17.27740478515625, + "learning_rate": 1.286981872969342e-05, + "loss": 2.6482, + "step": 7166500 + }, + { + "epoch": 2.227966308498882, + "grad_norm": 11.483429908752441, + "learning_rate": 1.2867228191685307e-05, + "loss": 2.6221, + "step": 7167000 + }, + { + "epoch": 2.2281217407793683, + "grad_norm": 24.90559959411621, + "learning_rate": 1.2864637653677192e-05, + "loss": 2.6844, + "step": 7167500 + }, + { + "epoch": 2.2282771730598556, + "grad_norm": 10.634309768676758, + "learning_rate": 1.2862047115669076e-05, + "loss": 2.6215, + "step": 7168000 + }, + { + "epoch": 2.2284326053403425, + "grad_norm": 10.079927444458008, + "learning_rate": 1.2859456577660962e-05, + "loss": 2.6186, + "step": 7168500 + }, + { + "epoch": 2.2285880376208294, + "grad_norm": 12.806611061096191, + "learning_rate": 1.2856866039652849e-05, + "loss": 2.6669, + "step": 7169000 + }, + { + "epoch": 2.228743469901316, + "grad_norm": 9.5142183303833, + "learning_rate": 1.2854275501644733e-05, + "loss": 2.6414, + "step": 7169500 + }, + { + "epoch": 2.228898902181803, + "grad_norm": 9.18840217590332, + "learning_rate": 1.2851684963636618e-05, + "loss": 2.6699, + "step": 7170000 + }, + { + "epoch": 2.22905433446229, + "grad_norm": 10.883435249328613, + "learning_rate": 1.2849094425628505e-05, + "loss": 2.677, + "step": 7170500 + }, + { + "epoch": 2.229209766742777, + "grad_norm": 10.806145668029785, + "learning_rate": 1.2846503887620389e-05, + "loss": 2.6586, + "step": 7171000 + }, + { + "epoch": 2.2293651990232637, + "grad_norm": 11.01914119720459, + "learning_rate": 1.2843913349612274e-05, + "loss": 2.666, + "step": 7171500 + }, + { + "epoch": 2.2295206313037506, + "grad_norm": 9.429760932922363, + "learning_rate": 1.2841322811604158e-05, + "loss": 2.6409, + "step": 7172000 + }, + { + "epoch": 2.2296760635842374, + "grad_norm": 18.816543579101562, + "learning_rate": 1.2838732273596047e-05, + "loss": 2.6277, + "step": 7172500 + }, + { + "epoch": 2.2298314958647243, + "grad_norm": 8.354694366455078, + "learning_rate": 1.283614173558793e-05, + "loss": 2.6869, + "step": 7173000 + }, + { + "epoch": 2.229986928145211, + "grad_norm": 13.505043029785156, + "learning_rate": 1.2833551197579816e-05, + "loss": 2.6735, + "step": 7173500 + }, + { + "epoch": 2.230142360425698, + "grad_norm": 15.622140884399414, + "learning_rate": 1.2830960659571703e-05, + "loss": 2.6698, + "step": 7174000 + }, + { + "epoch": 2.230297792706185, + "grad_norm": 10.812239646911621, + "learning_rate": 1.2828370121563587e-05, + "loss": 2.6526, + "step": 7174500 + }, + { + "epoch": 2.2304532249866718, + "grad_norm": 10.366941452026367, + "learning_rate": 1.2825779583555473e-05, + "loss": 2.6412, + "step": 7175000 + }, + { + "epoch": 2.2306086572671586, + "grad_norm": 6.997445106506348, + "learning_rate": 1.2823189045547356e-05, + "loss": 2.6615, + "step": 7175500 + }, + { + "epoch": 2.2307640895476455, + "grad_norm": 8.228425025939941, + "learning_rate": 1.2820598507539244e-05, + "loss": 2.6963, + "step": 7176000 + }, + { + "epoch": 2.2309195218281324, + "grad_norm": 11.0784273147583, + "learning_rate": 1.2818007969531129e-05, + "loss": 2.65, + "step": 7176500 + }, + { + "epoch": 2.2310749541086192, + "grad_norm": 16.06426239013672, + "learning_rate": 1.2815417431523013e-05, + "loss": 2.6544, + "step": 7177000 + }, + { + "epoch": 2.231230386389106, + "grad_norm": 9.586087226867676, + "learning_rate": 1.2812826893514902e-05, + "loss": 2.6722, + "step": 7177500 + }, + { + "epoch": 2.231385818669593, + "grad_norm": 9.519526481628418, + "learning_rate": 1.2810236355506785e-05, + "loss": 2.6244, + "step": 7178000 + }, + { + "epoch": 2.23154125095008, + "grad_norm": 9.62807559967041, + "learning_rate": 1.280764581749867e-05, + "loss": 2.6507, + "step": 7178500 + }, + { + "epoch": 2.2316966832305667, + "grad_norm": 9.213805198669434, + "learning_rate": 1.2805055279490555e-05, + "loss": 2.6322, + "step": 7179000 + }, + { + "epoch": 2.2318521155110536, + "grad_norm": 11.591078758239746, + "learning_rate": 1.2802464741482442e-05, + "loss": 2.643, + "step": 7179500 + }, + { + "epoch": 2.2320075477915404, + "grad_norm": 10.443679809570312, + "learning_rate": 1.2799874203474327e-05, + "loss": 2.6878, + "step": 7180000 + }, + { + "epoch": 2.2321629800720273, + "grad_norm": 17.10089874267578, + "learning_rate": 1.2797283665466211e-05, + "loss": 2.6566, + "step": 7180500 + }, + { + "epoch": 2.232318412352514, + "grad_norm": 10.009574890136719, + "learning_rate": 1.2794693127458096e-05, + "loss": 2.7016, + "step": 7181000 + }, + { + "epoch": 2.232473844633001, + "grad_norm": 8.21452522277832, + "learning_rate": 1.2792102589449984e-05, + "loss": 2.6442, + "step": 7181500 + }, + { + "epoch": 2.232629276913488, + "grad_norm": 9.569382667541504, + "learning_rate": 1.2789512051441867e-05, + "loss": 2.6348, + "step": 7182000 + }, + { + "epoch": 2.2327847091939748, + "grad_norm": 9.118851661682129, + "learning_rate": 1.2786921513433753e-05, + "loss": 2.6308, + "step": 7182500 + }, + { + "epoch": 2.2329401414744616, + "grad_norm": 16.5813045501709, + "learning_rate": 1.278433097542564e-05, + "loss": 2.6593, + "step": 7183000 + }, + { + "epoch": 2.2330955737549485, + "grad_norm": 22.962129592895508, + "learning_rate": 1.2781740437417525e-05, + "loss": 2.6297, + "step": 7183500 + }, + { + "epoch": 2.2332510060354354, + "grad_norm": 12.030624389648438, + "learning_rate": 1.277914989940941e-05, + "loss": 2.6029, + "step": 7184000 + }, + { + "epoch": 2.2334064383159222, + "grad_norm": 9.209035873413086, + "learning_rate": 1.2776559361401295e-05, + "loss": 2.6705, + "step": 7184500 + }, + { + "epoch": 2.233561870596409, + "grad_norm": 12.802806854248047, + "learning_rate": 1.2773968823393182e-05, + "loss": 2.6387, + "step": 7185000 + }, + { + "epoch": 2.233717302876896, + "grad_norm": 10.375481605529785, + "learning_rate": 1.2771378285385066e-05, + "loss": 2.6595, + "step": 7185500 + }, + { + "epoch": 2.233872735157383, + "grad_norm": 9.094239234924316, + "learning_rate": 1.2768787747376951e-05, + "loss": 2.6089, + "step": 7186000 + }, + { + "epoch": 2.2340281674378697, + "grad_norm": 11.297277450561523, + "learning_rate": 1.2766197209368838e-05, + "loss": 2.6427, + "step": 7186500 + }, + { + "epoch": 2.2341835997183566, + "grad_norm": 15.252893447875977, + "learning_rate": 1.2763606671360722e-05, + "loss": 2.656, + "step": 7187000 + }, + { + "epoch": 2.2343390319988434, + "grad_norm": 14.841137886047363, + "learning_rate": 1.2761016133352607e-05, + "loss": 2.6283, + "step": 7187500 + }, + { + "epoch": 2.2344944642793303, + "grad_norm": 9.856842041015625, + "learning_rate": 1.2758425595344493e-05, + "loss": 2.635, + "step": 7188000 + }, + { + "epoch": 2.234649896559817, + "grad_norm": 9.3853178024292, + "learning_rate": 1.275583505733638e-05, + "loss": 2.6888, + "step": 7188500 + }, + { + "epoch": 2.234805328840304, + "grad_norm": 7.9788713455200195, + "learning_rate": 1.2753244519328264e-05, + "loss": 2.596, + "step": 7189000 + }, + { + "epoch": 2.234960761120791, + "grad_norm": 7.998826026916504, + "learning_rate": 1.275065398132015e-05, + "loss": 2.6739, + "step": 7189500 + }, + { + "epoch": 2.2351161934012778, + "grad_norm": 11.361433029174805, + "learning_rate": 1.2748063443312033e-05, + "loss": 2.6819, + "step": 7190000 + }, + { + "epoch": 2.235271625681765, + "grad_norm": 10.122767448425293, + "learning_rate": 1.274547290530392e-05, + "loss": 2.6699, + "step": 7190500 + }, + { + "epoch": 2.2354270579622515, + "grad_norm": 10.342338562011719, + "learning_rate": 1.2742882367295806e-05, + "loss": 2.5796, + "step": 7191000 + }, + { + "epoch": 2.235582490242739, + "grad_norm": 10.849934577941895, + "learning_rate": 1.274029182928769e-05, + "loss": 2.6251, + "step": 7191500 + }, + { + "epoch": 2.2357379225232257, + "grad_norm": 10.286959648132324, + "learning_rate": 1.2737701291279577e-05, + "loss": 2.6477, + "step": 7192000 + }, + { + "epoch": 2.2358933548037125, + "grad_norm": 36.093544006347656, + "learning_rate": 1.2735110753271462e-05, + "loss": 2.6273, + "step": 7192500 + }, + { + "epoch": 2.2360487870841994, + "grad_norm": 9.5552396774292, + "learning_rate": 1.2732520215263347e-05, + "loss": 2.6774, + "step": 7193000 + }, + { + "epoch": 2.2362042193646863, + "grad_norm": 10.399503707885742, + "learning_rate": 1.2729929677255231e-05, + "loss": 2.6385, + "step": 7193500 + }, + { + "epoch": 2.236359651645173, + "grad_norm": 14.685890197753906, + "learning_rate": 1.2727339139247118e-05, + "loss": 2.6929, + "step": 7194000 + }, + { + "epoch": 2.23651508392566, + "grad_norm": 8.635143280029297, + "learning_rate": 1.2724748601239004e-05, + "loss": 2.6524, + "step": 7194500 + }, + { + "epoch": 2.236670516206147, + "grad_norm": 56.380775451660156, + "learning_rate": 1.2722158063230888e-05, + "loss": 2.6956, + "step": 7195000 + }, + { + "epoch": 2.2368259484866337, + "grad_norm": 13.908403396606445, + "learning_rate": 1.2719567525222775e-05, + "loss": 2.6358, + "step": 7195500 + }, + { + "epoch": 2.2369813807671206, + "grad_norm": 19.616065979003906, + "learning_rate": 1.271697698721466e-05, + "loss": 2.6442, + "step": 7196000 + }, + { + "epoch": 2.2371368130476075, + "grad_norm": 69.05726623535156, + "learning_rate": 1.2714386449206544e-05, + "loss": 2.665, + "step": 7196500 + }, + { + "epoch": 2.2372922453280943, + "grad_norm": 11.406923294067383, + "learning_rate": 1.271179591119843e-05, + "loss": 2.6628, + "step": 7197000 + }, + { + "epoch": 2.237447677608581, + "grad_norm": 9.404034614562988, + "learning_rate": 1.2709205373190317e-05, + "loss": 2.7005, + "step": 7197500 + }, + { + "epoch": 2.237603109889068, + "grad_norm": 39.49403762817383, + "learning_rate": 1.2706614835182202e-05, + "loss": 2.6522, + "step": 7198000 + }, + { + "epoch": 2.237758542169555, + "grad_norm": 10.313891410827637, + "learning_rate": 1.2704024297174086e-05, + "loss": 2.6368, + "step": 7198500 + }, + { + "epoch": 2.237913974450042, + "grad_norm": 7.876594066619873, + "learning_rate": 1.2701433759165971e-05, + "loss": 2.6322, + "step": 7199000 + }, + { + "epoch": 2.2380694067305287, + "grad_norm": 11.297225952148438, + "learning_rate": 1.2698843221157858e-05, + "loss": 2.6665, + "step": 7199500 + }, + { + "epoch": 2.2382248390110155, + "grad_norm": 10.47504997253418, + "learning_rate": 1.2696252683149742e-05, + "loss": 2.6763, + "step": 7200000 + }, + { + "epoch": 2.2383802712915024, + "grad_norm": 8.812747955322266, + "learning_rate": 1.2693662145141628e-05, + "loss": 2.6321, + "step": 7200500 + }, + { + "epoch": 2.2385357035719893, + "grad_norm": 10.093758583068848, + "learning_rate": 1.2691071607133515e-05, + "loss": 2.7013, + "step": 7201000 + }, + { + "epoch": 2.238691135852476, + "grad_norm": 11.9350004196167, + "learning_rate": 1.2688481069125399e-05, + "loss": 2.6671, + "step": 7201500 + }, + { + "epoch": 2.238846568132963, + "grad_norm": 10.542673110961914, + "learning_rate": 1.2685890531117284e-05, + "loss": 2.6422, + "step": 7202000 + }, + { + "epoch": 2.23900200041345, + "grad_norm": 8.861295700073242, + "learning_rate": 1.2683299993109168e-05, + "loss": 2.71, + "step": 7202500 + }, + { + "epoch": 2.2391574326939367, + "grad_norm": 10.887615203857422, + "learning_rate": 1.2680709455101057e-05, + "loss": 2.6375, + "step": 7203000 + }, + { + "epoch": 2.2393128649744236, + "grad_norm": 13.036991119384766, + "learning_rate": 1.267811891709294e-05, + "loss": 2.6846, + "step": 7203500 + }, + { + "epoch": 2.2394682972549105, + "grad_norm": 18.5168399810791, + "learning_rate": 1.2675528379084826e-05, + "loss": 2.6258, + "step": 7204000 + }, + { + "epoch": 2.2396237295353973, + "grad_norm": 14.538248062133789, + "learning_rate": 1.2672937841076713e-05, + "loss": 2.6145, + "step": 7204500 + }, + { + "epoch": 2.239779161815884, + "grad_norm": 9.057202339172363, + "learning_rate": 1.2670347303068597e-05, + "loss": 2.6259, + "step": 7205000 + }, + { + "epoch": 2.239934594096371, + "grad_norm": 10.498772621154785, + "learning_rate": 1.2667756765060482e-05, + "loss": 2.5874, + "step": 7205500 + }, + { + "epoch": 2.240090026376858, + "grad_norm": 9.809602737426758, + "learning_rate": 1.2665166227052366e-05, + "loss": 2.6741, + "step": 7206000 + }, + { + "epoch": 2.240245458657345, + "grad_norm": 15.741018295288086, + "learning_rate": 1.2662575689044253e-05, + "loss": 2.6533, + "step": 7206500 + }, + { + "epoch": 2.2404008909378317, + "grad_norm": 8.761889457702637, + "learning_rate": 1.2659985151036139e-05, + "loss": 2.7106, + "step": 7207000 + }, + { + "epoch": 2.2405563232183185, + "grad_norm": 8.838066101074219, + "learning_rate": 1.2657394613028022e-05, + "loss": 2.6897, + "step": 7207500 + }, + { + "epoch": 2.2407117554988054, + "grad_norm": 11.220231056213379, + "learning_rate": 1.2654804075019908e-05, + "loss": 2.6537, + "step": 7208000 + }, + { + "epoch": 2.2408671877792923, + "grad_norm": 11.839509963989258, + "learning_rate": 1.2652213537011795e-05, + "loss": 2.6599, + "step": 7208500 + }, + { + "epoch": 2.241022620059779, + "grad_norm": 9.294039726257324, + "learning_rate": 1.264962299900368e-05, + "loss": 2.6461, + "step": 7209000 + }, + { + "epoch": 2.241178052340266, + "grad_norm": 8.260056495666504, + "learning_rate": 1.2647032460995564e-05, + "loss": 2.6601, + "step": 7209500 + }, + { + "epoch": 2.241333484620753, + "grad_norm": 10.154735565185547, + "learning_rate": 1.2644441922987451e-05, + "loss": 2.6038, + "step": 7210000 + }, + { + "epoch": 2.2414889169012397, + "grad_norm": 10.451393127441406, + "learning_rate": 1.2641851384979337e-05, + "loss": 2.6255, + "step": 7210500 + }, + { + "epoch": 2.2416443491817266, + "grad_norm": 21.59113121032715, + "learning_rate": 1.263926084697122e-05, + "loss": 2.6141, + "step": 7211000 + }, + { + "epoch": 2.2417997814622135, + "grad_norm": 10.300459861755371, + "learning_rate": 1.2636670308963106e-05, + "loss": 2.6773, + "step": 7211500 + }, + { + "epoch": 2.2419552137427003, + "grad_norm": 8.887249946594238, + "learning_rate": 1.2634079770954993e-05, + "loss": 2.697, + "step": 7212000 + }, + { + "epoch": 2.242110646023187, + "grad_norm": 34.83821105957031, + "learning_rate": 1.2631489232946877e-05, + "loss": 2.6005, + "step": 7212500 + }, + { + "epoch": 2.242266078303674, + "grad_norm": 8.606853485107422, + "learning_rate": 1.2628898694938762e-05, + "loss": 2.6689, + "step": 7213000 + }, + { + "epoch": 2.242421510584161, + "grad_norm": 16.4934139251709, + "learning_rate": 1.262630815693065e-05, + "loss": 2.64, + "step": 7213500 + }, + { + "epoch": 2.242576942864648, + "grad_norm": 8.08721923828125, + "learning_rate": 1.2623717618922535e-05, + "loss": 2.6574, + "step": 7214000 + }, + { + "epoch": 2.2427323751451347, + "grad_norm": 9.566269874572754, + "learning_rate": 1.2621127080914419e-05, + "loss": 2.6806, + "step": 7214500 + }, + { + "epoch": 2.242887807425622, + "grad_norm": 7.981472015380859, + "learning_rate": 1.2618536542906304e-05, + "loss": 2.6656, + "step": 7215000 + }, + { + "epoch": 2.2430432397061084, + "grad_norm": 11.261706352233887, + "learning_rate": 1.2615946004898191e-05, + "loss": 2.6418, + "step": 7215500 + }, + { + "epoch": 2.2431986719865957, + "grad_norm": 11.334659576416016, + "learning_rate": 1.2613355466890075e-05, + "loss": 2.6908, + "step": 7216000 + }, + { + "epoch": 2.2433541042670826, + "grad_norm": 14.808954238891602, + "learning_rate": 1.261076492888196e-05, + "loss": 2.6426, + "step": 7216500 + }, + { + "epoch": 2.2435095365475695, + "grad_norm": 14.410493850708008, + "learning_rate": 1.2608174390873844e-05, + "loss": 2.6865, + "step": 7217000 + }, + { + "epoch": 2.2436649688280563, + "grad_norm": 30.895275115966797, + "learning_rate": 1.2605583852865732e-05, + "loss": 2.6493, + "step": 7217500 + }, + { + "epoch": 2.243820401108543, + "grad_norm": 7.907965660095215, + "learning_rate": 1.2602993314857617e-05, + "loss": 2.6854, + "step": 7218000 + }, + { + "epoch": 2.24397583338903, + "grad_norm": 15.717367172241211, + "learning_rate": 1.26004027768495e-05, + "loss": 2.6364, + "step": 7218500 + }, + { + "epoch": 2.244131265669517, + "grad_norm": 9.410490989685059, + "learning_rate": 1.259781223884139e-05, + "loss": 2.6132, + "step": 7219000 + }, + { + "epoch": 2.244286697950004, + "grad_norm": 8.156184196472168, + "learning_rate": 1.2595221700833273e-05, + "loss": 2.6295, + "step": 7219500 + }, + { + "epoch": 2.2444421302304907, + "grad_norm": 10.072297096252441, + "learning_rate": 1.2592631162825159e-05, + "loss": 2.6255, + "step": 7220000 + }, + { + "epoch": 2.2445975625109775, + "grad_norm": 10.421809196472168, + "learning_rate": 1.2590040624817043e-05, + "loss": 2.6936, + "step": 7220500 + }, + { + "epoch": 2.2447529947914644, + "grad_norm": 17.344606399536133, + "learning_rate": 1.258745008680893e-05, + "loss": 2.6317, + "step": 7221000 + }, + { + "epoch": 2.2449084270719513, + "grad_norm": 9.468056678771973, + "learning_rate": 1.2584859548800815e-05, + "loss": 2.6341, + "step": 7221500 + }, + { + "epoch": 2.245063859352438, + "grad_norm": 9.940185546875, + "learning_rate": 1.2582269010792699e-05, + "loss": 2.6407, + "step": 7222000 + }, + { + "epoch": 2.245219291632925, + "grad_norm": 9.333502769470215, + "learning_rate": 1.2579678472784586e-05, + "loss": 2.6702, + "step": 7222500 + }, + { + "epoch": 2.245374723913412, + "grad_norm": 11.055627822875977, + "learning_rate": 1.2577087934776472e-05, + "loss": 2.6801, + "step": 7223000 + }, + { + "epoch": 2.2455301561938987, + "grad_norm": 8.801846504211426, + "learning_rate": 1.2574497396768355e-05, + "loss": 2.6558, + "step": 7223500 + }, + { + "epoch": 2.2456855884743856, + "grad_norm": 9.877422332763672, + "learning_rate": 1.257190685876024e-05, + "loss": 2.6788, + "step": 7224000 + }, + { + "epoch": 2.2458410207548725, + "grad_norm": 9.9110107421875, + "learning_rate": 1.2569316320752128e-05, + "loss": 2.5816, + "step": 7224500 + }, + { + "epoch": 2.2459964530353593, + "grad_norm": 8.237556457519531, + "learning_rate": 1.2566725782744013e-05, + "loss": 2.6516, + "step": 7225000 + }, + { + "epoch": 2.246151885315846, + "grad_norm": 10.900328636169434, + "learning_rate": 1.2564135244735897e-05, + "loss": 2.6253, + "step": 7225500 + }, + { + "epoch": 2.246307317596333, + "grad_norm": 8.573287963867188, + "learning_rate": 1.2561544706727783e-05, + "loss": 2.6357, + "step": 7226000 + }, + { + "epoch": 2.24646274987682, + "grad_norm": 11.293524742126465, + "learning_rate": 1.255895416871967e-05, + "loss": 2.6718, + "step": 7226500 + }, + { + "epoch": 2.246618182157307, + "grad_norm": 9.987339973449707, + "learning_rate": 1.2556363630711554e-05, + "loss": 2.6385, + "step": 7227000 + }, + { + "epoch": 2.2467736144377937, + "grad_norm": 9.759276390075684, + "learning_rate": 1.2553773092703439e-05, + "loss": 2.6771, + "step": 7227500 + }, + { + "epoch": 2.2469290467182805, + "grad_norm": 14.66057014465332, + "learning_rate": 1.2551182554695326e-05, + "loss": 2.6733, + "step": 7228000 + }, + { + "epoch": 2.2470844789987674, + "grad_norm": 10.018325805664062, + "learning_rate": 1.254859201668721e-05, + "loss": 2.611, + "step": 7228500 + }, + { + "epoch": 2.2472399112792543, + "grad_norm": 9.229485511779785, + "learning_rate": 1.2546001478679095e-05, + "loss": 2.6754, + "step": 7229000 + }, + { + "epoch": 2.247395343559741, + "grad_norm": 9.652859687805176, + "learning_rate": 1.254341094067098e-05, + "loss": 2.617, + "step": 7229500 + }, + { + "epoch": 2.247550775840228, + "grad_norm": 8.675727844238281, + "learning_rate": 1.2540820402662868e-05, + "loss": 2.6189, + "step": 7230000 + }, + { + "epoch": 2.247706208120715, + "grad_norm": 10.242180824279785, + "learning_rate": 1.2538229864654752e-05, + "loss": 2.6694, + "step": 7230500 + }, + { + "epoch": 2.2478616404012017, + "grad_norm": 23.558273315429688, + "learning_rate": 1.2535639326646637e-05, + "loss": 2.6257, + "step": 7231000 + }, + { + "epoch": 2.2480170726816886, + "grad_norm": 59.666927337646484, + "learning_rate": 1.2533048788638524e-05, + "loss": 2.6431, + "step": 7231500 + }, + { + "epoch": 2.2481725049621755, + "grad_norm": 15.315305709838867, + "learning_rate": 1.2530458250630408e-05, + "loss": 2.6842, + "step": 7232000 + }, + { + "epoch": 2.2483279372426623, + "grad_norm": 9.986248970031738, + "learning_rate": 1.2527867712622294e-05, + "loss": 2.6191, + "step": 7232500 + }, + { + "epoch": 2.248483369523149, + "grad_norm": 8.483797073364258, + "learning_rate": 1.2525277174614177e-05, + "loss": 2.6325, + "step": 7233000 + }, + { + "epoch": 2.248638801803636, + "grad_norm": 10.465794563293457, + "learning_rate": 1.2522686636606065e-05, + "loss": 2.6314, + "step": 7233500 + }, + { + "epoch": 2.248794234084123, + "grad_norm": 10.512862205505371, + "learning_rate": 1.252009609859795e-05, + "loss": 2.6644, + "step": 7234000 + }, + { + "epoch": 2.24894966636461, + "grad_norm": 22.651784896850586, + "learning_rate": 1.2517505560589834e-05, + "loss": 2.6433, + "step": 7234500 + }, + { + "epoch": 2.2491050986450967, + "grad_norm": 13.058823585510254, + "learning_rate": 1.251491502258172e-05, + "loss": 2.6081, + "step": 7235000 + }, + { + "epoch": 2.2492605309255835, + "grad_norm": 12.654336929321289, + "learning_rate": 1.2512324484573606e-05, + "loss": 2.5827, + "step": 7235500 + }, + { + "epoch": 2.2494159632060704, + "grad_norm": 20.884021759033203, + "learning_rate": 1.2509733946565492e-05, + "loss": 2.6245, + "step": 7236000 + }, + { + "epoch": 2.2495713954865573, + "grad_norm": 12.337471008300781, + "learning_rate": 1.2507143408557376e-05, + "loss": 2.6812, + "step": 7236500 + }, + { + "epoch": 2.249726827767044, + "grad_norm": 9.489177703857422, + "learning_rate": 1.2504552870549263e-05, + "loss": 2.6362, + "step": 7237000 + }, + { + "epoch": 2.249882260047531, + "grad_norm": 28.528297424316406, + "learning_rate": 1.2501962332541148e-05, + "loss": 2.6598, + "step": 7237500 + }, + { + "epoch": 2.250037692328018, + "grad_norm": 19.18927574157715, + "learning_rate": 1.2499371794533032e-05, + "loss": 2.6545, + "step": 7238000 + }, + { + "epoch": 2.250193124608505, + "grad_norm": 8.352943420410156, + "learning_rate": 1.2496781256524919e-05, + "loss": 2.6557, + "step": 7238500 + }, + { + "epoch": 2.2503485568889916, + "grad_norm": 12.294565200805664, + "learning_rate": 1.2494190718516803e-05, + "loss": 2.66, + "step": 7239000 + }, + { + "epoch": 2.250503989169479, + "grad_norm": 12.349541664123535, + "learning_rate": 1.2491600180508688e-05, + "loss": 2.6521, + "step": 7239500 + }, + { + "epoch": 2.2506594214499653, + "grad_norm": 8.674079895019531, + "learning_rate": 1.2489009642500576e-05, + "loss": 2.6503, + "step": 7240000 + }, + { + "epoch": 2.2508148537304526, + "grad_norm": 13.160181045532227, + "learning_rate": 1.248641910449246e-05, + "loss": 2.6398, + "step": 7240500 + }, + { + "epoch": 2.2509702860109395, + "grad_norm": 7.612066268920898, + "learning_rate": 1.2483828566484346e-05, + "loss": 2.6599, + "step": 7241000 + }, + { + "epoch": 2.2511257182914264, + "grad_norm": 8.652766227722168, + "learning_rate": 1.248123802847623e-05, + "loss": 2.618, + "step": 7241500 + }, + { + "epoch": 2.2512811505719132, + "grad_norm": 13.375555992126465, + "learning_rate": 1.2478647490468116e-05, + "loss": 2.6239, + "step": 7242000 + }, + { + "epoch": 2.2514365828524, + "grad_norm": 9.426644325256348, + "learning_rate": 1.2476056952460001e-05, + "loss": 2.6284, + "step": 7242500 + }, + { + "epoch": 2.251592015132887, + "grad_norm": 11.599854469299316, + "learning_rate": 1.2473466414451887e-05, + "loss": 2.636, + "step": 7243000 + }, + { + "epoch": 2.251747447413374, + "grad_norm": 9.82784366607666, + "learning_rate": 1.2470875876443772e-05, + "loss": 2.5946, + "step": 7243500 + }, + { + "epoch": 2.2519028796938607, + "grad_norm": 7.927104473114014, + "learning_rate": 1.2468285338435658e-05, + "loss": 2.6, + "step": 7244000 + }, + { + "epoch": 2.2520583119743476, + "grad_norm": 9.803215980529785, + "learning_rate": 1.2465694800427543e-05, + "loss": 2.6302, + "step": 7244500 + }, + { + "epoch": 2.2522137442548344, + "grad_norm": 8.25950813293457, + "learning_rate": 1.2463104262419428e-05, + "loss": 2.6334, + "step": 7245000 + }, + { + "epoch": 2.2523691765353213, + "grad_norm": 10.559380531311035, + "learning_rate": 1.2460513724411314e-05, + "loss": 2.6313, + "step": 7245500 + }, + { + "epoch": 2.252524608815808, + "grad_norm": 10.101042747497559, + "learning_rate": 1.24579231864032e-05, + "loss": 2.6897, + "step": 7246000 + }, + { + "epoch": 2.252680041096295, + "grad_norm": 9.948715209960938, + "learning_rate": 1.2455332648395085e-05, + "loss": 2.6684, + "step": 7246500 + }, + { + "epoch": 2.252835473376782, + "grad_norm": 9.580574989318848, + "learning_rate": 1.245274211038697e-05, + "loss": 2.6837, + "step": 7247000 + }, + { + "epoch": 2.252990905657269, + "grad_norm": 12.345799446105957, + "learning_rate": 1.2450151572378856e-05, + "loss": 2.6865, + "step": 7247500 + }, + { + "epoch": 2.2531463379377556, + "grad_norm": 15.021177291870117, + "learning_rate": 1.244756103437074e-05, + "loss": 2.6808, + "step": 7248000 + }, + { + "epoch": 2.2533017702182425, + "grad_norm": 9.907469749450684, + "learning_rate": 1.2444970496362627e-05, + "loss": 2.5983, + "step": 7248500 + }, + { + "epoch": 2.2534572024987294, + "grad_norm": 11.969019889831543, + "learning_rate": 1.2442379958354512e-05, + "loss": 2.6651, + "step": 7249000 + }, + { + "epoch": 2.2536126347792163, + "grad_norm": 9.866693496704102, + "learning_rate": 1.2439789420346398e-05, + "loss": 2.6392, + "step": 7249500 + }, + { + "epoch": 2.253768067059703, + "grad_norm": 7.025405406951904, + "learning_rate": 1.2437198882338283e-05, + "loss": 2.6262, + "step": 7250000 + }, + { + "epoch": 2.25392349934019, + "grad_norm": 10.980616569519043, + "learning_rate": 1.2434608344330167e-05, + "loss": 2.6688, + "step": 7250500 + }, + { + "epoch": 2.254078931620677, + "grad_norm": 10.115554809570312, + "learning_rate": 1.2432017806322054e-05, + "loss": 2.6862, + "step": 7251000 + }, + { + "epoch": 2.2542343639011637, + "grad_norm": 10.818291664123535, + "learning_rate": 1.2429427268313938e-05, + "loss": 2.634, + "step": 7251500 + }, + { + "epoch": 2.2543897961816506, + "grad_norm": 8.706925392150879, + "learning_rate": 1.2426836730305825e-05, + "loss": 2.6447, + "step": 7252000 + }, + { + "epoch": 2.2545452284621375, + "grad_norm": 9.9680814743042, + "learning_rate": 1.2424246192297709e-05, + "loss": 2.6517, + "step": 7252500 + }, + { + "epoch": 2.2547006607426243, + "grad_norm": 10.195796012878418, + "learning_rate": 1.2421655654289594e-05, + "loss": 2.6612, + "step": 7253000 + }, + { + "epoch": 2.254856093023111, + "grad_norm": 9.162461280822754, + "learning_rate": 1.2419065116281481e-05, + "loss": 2.6384, + "step": 7253500 + }, + { + "epoch": 2.255011525303598, + "grad_norm": 10.339829444885254, + "learning_rate": 1.2416474578273365e-05, + "loss": 2.608, + "step": 7254000 + }, + { + "epoch": 2.255166957584085, + "grad_norm": 9.349732398986816, + "learning_rate": 1.2413884040265252e-05, + "loss": 2.6678, + "step": 7254500 + }, + { + "epoch": 2.255322389864572, + "grad_norm": 9.070911407470703, + "learning_rate": 1.2411293502257136e-05, + "loss": 2.6093, + "step": 7255000 + }, + { + "epoch": 2.2554778221450587, + "grad_norm": 10.471542358398438, + "learning_rate": 1.2408702964249021e-05, + "loss": 2.6135, + "step": 7255500 + }, + { + "epoch": 2.2556332544255455, + "grad_norm": 8.386669158935547, + "learning_rate": 1.2406112426240907e-05, + "loss": 2.6748, + "step": 7256000 + }, + { + "epoch": 2.2557886867060324, + "grad_norm": 16.49373435974121, + "learning_rate": 1.2403521888232792e-05, + "loss": 2.6101, + "step": 7256500 + }, + { + "epoch": 2.2559441189865193, + "grad_norm": 12.149478912353516, + "learning_rate": 1.2400931350224678e-05, + "loss": 2.6575, + "step": 7257000 + }, + { + "epoch": 2.256099551267006, + "grad_norm": 11.61553955078125, + "learning_rate": 1.2398340812216563e-05, + "loss": 2.6056, + "step": 7257500 + }, + { + "epoch": 2.256254983547493, + "grad_norm": 52.01457595825195, + "learning_rate": 1.2395750274208449e-05, + "loss": 2.6231, + "step": 7258000 + }, + { + "epoch": 2.25641041582798, + "grad_norm": 9.398839950561523, + "learning_rate": 1.2393159736200334e-05, + "loss": 2.6239, + "step": 7258500 + }, + { + "epoch": 2.2565658481084667, + "grad_norm": 5.913853645324707, + "learning_rate": 1.239056919819222e-05, + "loss": 2.6027, + "step": 7259000 + }, + { + "epoch": 2.2567212803889536, + "grad_norm": 11.263340950012207, + "learning_rate": 1.2387978660184105e-05, + "loss": 2.7159, + "step": 7259500 + }, + { + "epoch": 2.2568767126694405, + "grad_norm": 11.988740921020508, + "learning_rate": 1.238538812217599e-05, + "loss": 2.6528, + "step": 7260000 + }, + { + "epoch": 2.2570321449499273, + "grad_norm": 11.836801528930664, + "learning_rate": 1.2382797584167876e-05, + "loss": 2.6702, + "step": 7260500 + }, + { + "epoch": 2.257187577230414, + "grad_norm": 31.629323959350586, + "learning_rate": 1.2380207046159761e-05, + "loss": 2.673, + "step": 7261000 + }, + { + "epoch": 2.257343009510901, + "grad_norm": 10.725226402282715, + "learning_rate": 1.2377616508151645e-05, + "loss": 2.6104, + "step": 7261500 + }, + { + "epoch": 2.2574984417913884, + "grad_norm": 9.46221923828125, + "learning_rate": 1.2375025970143532e-05, + "loss": 2.6872, + "step": 7262000 + }, + { + "epoch": 2.257653874071875, + "grad_norm": 11.816768646240234, + "learning_rate": 1.2372435432135418e-05, + "loss": 2.6486, + "step": 7262500 + }, + { + "epoch": 2.257809306352362, + "grad_norm": 11.15739631652832, + "learning_rate": 1.2369844894127303e-05, + "loss": 2.666, + "step": 7263000 + }, + { + "epoch": 2.2579647386328485, + "grad_norm": 10.160689353942871, + "learning_rate": 1.2367254356119189e-05, + "loss": 2.6286, + "step": 7263500 + }, + { + "epoch": 2.258120170913336, + "grad_norm": 10.370672225952148, + "learning_rate": 1.2364663818111072e-05, + "loss": 2.6576, + "step": 7264000 + }, + { + "epoch": 2.2582756031938223, + "grad_norm": 11.394867897033691, + "learning_rate": 1.236207328010296e-05, + "loss": 2.6856, + "step": 7264500 + }, + { + "epoch": 2.2584310354743096, + "grad_norm": 7.670248985290527, + "learning_rate": 1.2359482742094843e-05, + "loss": 2.6429, + "step": 7265000 + }, + { + "epoch": 2.2585864677547964, + "grad_norm": 8.52625560760498, + "learning_rate": 1.235689220408673e-05, + "loss": 2.6121, + "step": 7265500 + }, + { + "epoch": 2.2587419000352833, + "grad_norm": 9.33696460723877, + "learning_rate": 1.2354301666078614e-05, + "loss": 2.6539, + "step": 7266000 + }, + { + "epoch": 2.25889733231577, + "grad_norm": 11.138952255249023, + "learning_rate": 1.23517111280705e-05, + "loss": 2.6076, + "step": 7266500 + }, + { + "epoch": 2.259052764596257, + "grad_norm": 10.536751747131348, + "learning_rate": 1.2349120590062387e-05, + "loss": 2.6459, + "step": 7267000 + }, + { + "epoch": 2.259208196876744, + "grad_norm": 21.34053611755371, + "learning_rate": 1.234653005205427e-05, + "loss": 2.6196, + "step": 7267500 + }, + { + "epoch": 2.2593636291572308, + "grad_norm": 9.430716514587402, + "learning_rate": 1.2343939514046158e-05, + "loss": 2.6798, + "step": 7268000 + }, + { + "epoch": 2.2595190614377176, + "grad_norm": 19.72671127319336, + "learning_rate": 1.2341348976038042e-05, + "loss": 2.6293, + "step": 7268500 + }, + { + "epoch": 2.2596744937182045, + "grad_norm": 8.94085693359375, + "learning_rate": 1.2338758438029927e-05, + "loss": 2.6768, + "step": 7269000 + }, + { + "epoch": 2.2598299259986914, + "grad_norm": 9.635295867919922, + "learning_rate": 1.2336167900021813e-05, + "loss": 2.5963, + "step": 7269500 + }, + { + "epoch": 2.2599853582791782, + "grad_norm": 10.096888542175293, + "learning_rate": 1.2333577362013698e-05, + "loss": 2.6472, + "step": 7270000 + }, + { + "epoch": 2.260140790559665, + "grad_norm": 10.2374849319458, + "learning_rate": 1.2330986824005583e-05, + "loss": 2.6609, + "step": 7270500 + }, + { + "epoch": 2.260296222840152, + "grad_norm": 8.639080047607422, + "learning_rate": 1.2328396285997469e-05, + "loss": 2.5994, + "step": 7271000 + }, + { + "epoch": 2.260451655120639, + "grad_norm": 9.645292282104492, + "learning_rate": 1.2325805747989354e-05, + "loss": 2.6242, + "step": 7271500 + }, + { + "epoch": 2.2606070874011257, + "grad_norm": 10.03081226348877, + "learning_rate": 1.232321520998124e-05, + "loss": 2.6569, + "step": 7272000 + }, + { + "epoch": 2.2607625196816126, + "grad_norm": 9.364910125732422, + "learning_rate": 1.2320624671973125e-05, + "loss": 2.669, + "step": 7272500 + }, + { + "epoch": 2.2609179519620994, + "grad_norm": 9.72256088256836, + "learning_rate": 1.231803413396501e-05, + "loss": 2.6105, + "step": 7273000 + }, + { + "epoch": 2.2610733842425863, + "grad_norm": 13.482836723327637, + "learning_rate": 1.2315443595956896e-05, + "loss": 2.6579, + "step": 7273500 + }, + { + "epoch": 2.261228816523073, + "grad_norm": 10.700262069702148, + "learning_rate": 1.2312853057948782e-05, + "loss": 2.6246, + "step": 7274000 + }, + { + "epoch": 2.26138424880356, + "grad_norm": 9.73587703704834, + "learning_rate": 1.2310262519940667e-05, + "loss": 2.6281, + "step": 7274500 + }, + { + "epoch": 2.261539681084047, + "grad_norm": 10.296360969543457, + "learning_rate": 1.2307671981932551e-05, + "loss": 2.6353, + "step": 7275000 + }, + { + "epoch": 2.2616951133645338, + "grad_norm": 9.425524711608887, + "learning_rate": 1.2305081443924438e-05, + "loss": 2.7084, + "step": 7275500 + }, + { + "epoch": 2.2618505456450206, + "grad_norm": 12.219953536987305, + "learning_rate": 1.2302490905916324e-05, + "loss": 2.6547, + "step": 7276000 + }, + { + "epoch": 2.2620059779255075, + "grad_norm": 11.472938537597656, + "learning_rate": 1.2299900367908209e-05, + "loss": 2.6179, + "step": 7276500 + }, + { + "epoch": 2.2621614102059944, + "grad_norm": 24.51144790649414, + "learning_rate": 1.2297309829900094e-05, + "loss": 2.6372, + "step": 7277000 + }, + { + "epoch": 2.2623168424864812, + "grad_norm": 11.099923133850098, + "learning_rate": 1.2294719291891978e-05, + "loss": 2.6447, + "step": 7277500 + }, + { + "epoch": 2.262472274766968, + "grad_norm": 14.57610034942627, + "learning_rate": 1.2292128753883865e-05, + "loss": 2.6493, + "step": 7278000 + }, + { + "epoch": 2.262627707047455, + "grad_norm": 9.059966087341309, + "learning_rate": 1.2289538215875749e-05, + "loss": 2.6237, + "step": 7278500 + }, + { + "epoch": 2.262783139327942, + "grad_norm": 9.345511436462402, + "learning_rate": 1.2286947677867636e-05, + "loss": 2.6525, + "step": 7279000 + }, + { + "epoch": 2.2629385716084287, + "grad_norm": 8.096573829650879, + "learning_rate": 1.228435713985952e-05, + "loss": 2.6751, + "step": 7279500 + }, + { + "epoch": 2.2630940038889156, + "grad_norm": 10.070115089416504, + "learning_rate": 1.2281766601851405e-05, + "loss": 2.6544, + "step": 7280000 + }, + { + "epoch": 2.2632494361694024, + "grad_norm": 9.87381362915039, + "learning_rate": 1.2279176063843293e-05, + "loss": 2.661, + "step": 7280500 + }, + { + "epoch": 2.2634048684498893, + "grad_norm": 9.366229057312012, + "learning_rate": 1.2276585525835176e-05, + "loss": 2.6516, + "step": 7281000 + }, + { + "epoch": 2.263560300730376, + "grad_norm": 11.636372566223145, + "learning_rate": 1.2273994987827064e-05, + "loss": 2.72, + "step": 7281500 + }, + { + "epoch": 2.263715733010863, + "grad_norm": 11.512839317321777, + "learning_rate": 1.2271404449818947e-05, + "loss": 2.5716, + "step": 7282000 + }, + { + "epoch": 2.26387116529135, + "grad_norm": 9.296636581420898, + "learning_rate": 1.2268813911810833e-05, + "loss": 2.6481, + "step": 7282500 + }, + { + "epoch": 2.2640265975718368, + "grad_norm": 8.59511661529541, + "learning_rate": 1.2266223373802718e-05, + "loss": 2.6424, + "step": 7283000 + }, + { + "epoch": 2.2641820298523236, + "grad_norm": 7.826292991638184, + "learning_rate": 1.2263632835794604e-05, + "loss": 2.6239, + "step": 7283500 + }, + { + "epoch": 2.2643374621328105, + "grad_norm": 7.847853183746338, + "learning_rate": 1.226104229778649e-05, + "loss": 2.682, + "step": 7284000 + }, + { + "epoch": 2.2644928944132974, + "grad_norm": 7.530412197113037, + "learning_rate": 1.2258451759778375e-05, + "loss": 2.6469, + "step": 7284500 + }, + { + "epoch": 2.2646483266937842, + "grad_norm": 13.509123802185059, + "learning_rate": 1.225586122177026e-05, + "loss": 2.6283, + "step": 7285000 + }, + { + "epoch": 2.264803758974271, + "grad_norm": 18.328996658325195, + "learning_rate": 1.2253270683762146e-05, + "loss": 2.6041, + "step": 7285500 + }, + { + "epoch": 2.264959191254758, + "grad_norm": 8.551291465759277, + "learning_rate": 1.2250680145754031e-05, + "loss": 2.6549, + "step": 7286000 + }, + { + "epoch": 2.2651146235352453, + "grad_norm": 11.123204231262207, + "learning_rate": 1.2248089607745916e-05, + "loss": 2.623, + "step": 7286500 + }, + { + "epoch": 2.2652700558157317, + "grad_norm": 6.9245476722717285, + "learning_rate": 1.2245499069737802e-05, + "loss": 2.6615, + "step": 7287000 + }, + { + "epoch": 2.265425488096219, + "grad_norm": 9.84162425994873, + "learning_rate": 1.2242908531729687e-05, + "loss": 2.6645, + "step": 7287500 + }, + { + "epoch": 2.2655809203767054, + "grad_norm": 10.37584114074707, + "learning_rate": 1.2240317993721573e-05, + "loss": 2.6719, + "step": 7288000 + }, + { + "epoch": 2.2657363526571928, + "grad_norm": 8.836155891418457, + "learning_rate": 1.2237727455713458e-05, + "loss": 2.6474, + "step": 7288500 + }, + { + "epoch": 2.2658917849376796, + "grad_norm": 9.041139602661133, + "learning_rate": 1.2235136917705344e-05, + "loss": 2.6602, + "step": 7289000 + }, + { + "epoch": 2.2660472172181665, + "grad_norm": 8.118035316467285, + "learning_rate": 1.223254637969723e-05, + "loss": 2.6511, + "step": 7289500 + }, + { + "epoch": 2.2662026494986534, + "grad_norm": 10.00922679901123, + "learning_rate": 1.2229955841689115e-05, + "loss": 2.7228, + "step": 7290000 + }, + { + "epoch": 2.26635808177914, + "grad_norm": 8.790294647216797, + "learning_rate": 1.2227365303681e-05, + "loss": 2.6835, + "step": 7290500 + }, + { + "epoch": 2.266513514059627, + "grad_norm": 10.442645072937012, + "learning_rate": 1.2224774765672886e-05, + "loss": 2.6755, + "step": 7291000 + }, + { + "epoch": 2.266668946340114, + "grad_norm": 10.500576972961426, + "learning_rate": 1.2222184227664771e-05, + "loss": 2.6248, + "step": 7291500 + }, + { + "epoch": 2.266824378620601, + "grad_norm": 8.993570327758789, + "learning_rate": 1.2219593689656655e-05, + "loss": 2.6463, + "step": 7292000 + }, + { + "epoch": 2.2669798109010877, + "grad_norm": 13.083641052246094, + "learning_rate": 1.2217003151648542e-05, + "loss": 2.6329, + "step": 7292500 + }, + { + "epoch": 2.2671352431815746, + "grad_norm": 6.8183112144470215, + "learning_rate": 1.2214412613640426e-05, + "loss": 2.6468, + "step": 7293000 + }, + { + "epoch": 2.2672906754620614, + "grad_norm": 9.269789695739746, + "learning_rate": 1.2211822075632313e-05, + "loss": 2.64, + "step": 7293500 + }, + { + "epoch": 2.2674461077425483, + "grad_norm": 11.05223560333252, + "learning_rate": 1.2209231537624198e-05, + "loss": 2.6209, + "step": 7294000 + }, + { + "epoch": 2.267601540023035, + "grad_norm": 10.249068260192871, + "learning_rate": 1.2206640999616082e-05, + "loss": 2.6304, + "step": 7294500 + }, + { + "epoch": 2.267756972303522, + "grad_norm": 8.981354713439941, + "learning_rate": 1.220405046160797e-05, + "loss": 2.6081, + "step": 7295000 + }, + { + "epoch": 2.267912404584009, + "grad_norm": 10.590412139892578, + "learning_rate": 1.2201459923599853e-05, + "loss": 2.6062, + "step": 7295500 + }, + { + "epoch": 2.2680678368644958, + "grad_norm": 11.822092056274414, + "learning_rate": 1.219886938559174e-05, + "loss": 2.6221, + "step": 7296000 + }, + { + "epoch": 2.2682232691449826, + "grad_norm": 10.345192909240723, + "learning_rate": 1.2196278847583624e-05, + "loss": 2.6251, + "step": 7296500 + }, + { + "epoch": 2.2683787014254695, + "grad_norm": 13.47756290435791, + "learning_rate": 1.219368830957551e-05, + "loss": 2.64, + "step": 7297000 + }, + { + "epoch": 2.2685341337059564, + "grad_norm": 12.41489028930664, + "learning_rate": 1.2191097771567395e-05, + "loss": 2.6049, + "step": 7297500 + }, + { + "epoch": 2.2686895659864432, + "grad_norm": 15.658803939819336, + "learning_rate": 1.218850723355928e-05, + "loss": 2.6403, + "step": 7298000 + }, + { + "epoch": 2.26884499826693, + "grad_norm": 9.572999954223633, + "learning_rate": 1.2185916695551167e-05, + "loss": 2.6299, + "step": 7298500 + }, + { + "epoch": 2.269000430547417, + "grad_norm": 7.776844024658203, + "learning_rate": 1.2183326157543051e-05, + "loss": 2.6298, + "step": 7299000 + }, + { + "epoch": 2.269155862827904, + "grad_norm": 30.63597869873047, + "learning_rate": 1.2180735619534937e-05, + "loss": 2.6103, + "step": 7299500 + }, + { + "epoch": 2.2693112951083907, + "grad_norm": 8.556169509887695, + "learning_rate": 1.2178145081526822e-05, + "loss": 2.6269, + "step": 7300000 + }, + { + "epoch": 2.2694667273888776, + "grad_norm": 9.351836204528809, + "learning_rate": 1.2175554543518708e-05, + "loss": 2.6752, + "step": 7300500 + }, + { + "epoch": 2.2696221596693644, + "grad_norm": 14.174991607666016, + "learning_rate": 1.2172964005510593e-05, + "loss": 2.5985, + "step": 7301000 + }, + { + "epoch": 2.2697775919498513, + "grad_norm": 7.918892860412598, + "learning_rate": 1.2170373467502479e-05, + "loss": 2.6219, + "step": 7301500 + }, + { + "epoch": 2.269933024230338, + "grad_norm": 10.656506538391113, + "learning_rate": 1.2167782929494364e-05, + "loss": 2.6376, + "step": 7302000 + }, + { + "epoch": 2.270088456510825, + "grad_norm": 7.431649684906006, + "learning_rate": 1.216519239148625e-05, + "loss": 2.617, + "step": 7302500 + }, + { + "epoch": 2.270243888791312, + "grad_norm": 10.183456420898438, + "learning_rate": 1.2162601853478135e-05, + "loss": 2.6563, + "step": 7303000 + }, + { + "epoch": 2.2703993210717988, + "grad_norm": 11.950614929199219, + "learning_rate": 1.216001131547002e-05, + "loss": 2.6088, + "step": 7303500 + }, + { + "epoch": 2.2705547533522856, + "grad_norm": 9.161938667297363, + "learning_rate": 1.2157420777461906e-05, + "loss": 2.66, + "step": 7304000 + }, + { + "epoch": 2.2707101856327725, + "grad_norm": 11.596038818359375, + "learning_rate": 1.2154830239453791e-05, + "loss": 2.6548, + "step": 7304500 + }, + { + "epoch": 2.2708656179132594, + "grad_norm": 10.317688941955566, + "learning_rate": 1.2152239701445677e-05, + "loss": 2.6474, + "step": 7305000 + }, + { + "epoch": 2.2710210501937462, + "grad_norm": 11.505531311035156, + "learning_rate": 1.214964916343756e-05, + "loss": 2.6473, + "step": 7305500 + }, + { + "epoch": 2.271176482474233, + "grad_norm": 34.22753143310547, + "learning_rate": 1.2147058625429448e-05, + "loss": 2.6625, + "step": 7306000 + }, + { + "epoch": 2.27133191475472, + "grad_norm": 7.689677715301514, + "learning_rate": 1.2144468087421331e-05, + "loss": 2.644, + "step": 7306500 + }, + { + "epoch": 2.271487347035207, + "grad_norm": 8.577692985534668, + "learning_rate": 1.2141877549413219e-05, + "loss": 2.5996, + "step": 7307000 + }, + { + "epoch": 2.2716427793156937, + "grad_norm": 8.754859924316406, + "learning_rate": 1.2139287011405104e-05, + "loss": 2.6415, + "step": 7307500 + }, + { + "epoch": 2.2717982115961806, + "grad_norm": 9.18860149383545, + "learning_rate": 1.2136696473396988e-05, + "loss": 2.6179, + "step": 7308000 + }, + { + "epoch": 2.2719536438766674, + "grad_norm": 35.176841735839844, + "learning_rate": 1.2134105935388875e-05, + "loss": 2.6435, + "step": 7308500 + }, + { + "epoch": 2.2721090761571543, + "grad_norm": 11.864429473876953, + "learning_rate": 1.2131515397380759e-05, + "loss": 2.6441, + "step": 7309000 + }, + { + "epoch": 2.272264508437641, + "grad_norm": 11.020472526550293, + "learning_rate": 1.2128924859372646e-05, + "loss": 2.6619, + "step": 7309500 + }, + { + "epoch": 2.272419940718128, + "grad_norm": 8.71398639678955, + "learning_rate": 1.212633432136453e-05, + "loss": 2.6461, + "step": 7310000 + }, + { + "epoch": 2.272575372998615, + "grad_norm": 8.844053268432617, + "learning_rate": 1.2123743783356415e-05, + "loss": 2.6227, + "step": 7310500 + }, + { + "epoch": 2.272730805279102, + "grad_norm": 10.446518898010254, + "learning_rate": 1.21211532453483e-05, + "loss": 2.6575, + "step": 7311000 + }, + { + "epoch": 2.2728862375595886, + "grad_norm": 11.819648742675781, + "learning_rate": 1.2118562707340186e-05, + "loss": 2.6359, + "step": 7311500 + }, + { + "epoch": 2.273041669840076, + "grad_norm": 10.393851280212402, + "learning_rate": 1.2115972169332073e-05, + "loss": 2.6245, + "step": 7312000 + }, + { + "epoch": 2.2731971021205624, + "grad_norm": 9.416769027709961, + "learning_rate": 1.2113381631323957e-05, + "loss": 2.646, + "step": 7312500 + }, + { + "epoch": 2.2733525344010497, + "grad_norm": 14.643324851989746, + "learning_rate": 1.2110791093315842e-05, + "loss": 2.6384, + "step": 7313000 + }, + { + "epoch": 2.2735079666815365, + "grad_norm": 10.792739868164062, + "learning_rate": 1.2108200555307728e-05, + "loss": 2.6753, + "step": 7313500 + }, + { + "epoch": 2.2736633989620234, + "grad_norm": 9.051336288452148, + "learning_rate": 1.2105610017299613e-05, + "loss": 2.6827, + "step": 7314000 + }, + { + "epoch": 2.2738188312425103, + "grad_norm": 9.667468070983887, + "learning_rate": 1.2103019479291499e-05, + "loss": 2.6185, + "step": 7314500 + }, + { + "epoch": 2.273974263522997, + "grad_norm": 9.033743858337402, + "learning_rate": 1.2100428941283384e-05, + "loss": 2.6323, + "step": 7315000 + }, + { + "epoch": 2.274129695803484, + "grad_norm": 9.737778663635254, + "learning_rate": 1.209783840327527e-05, + "loss": 2.6462, + "step": 7315500 + }, + { + "epoch": 2.274285128083971, + "grad_norm": 9.016351699829102, + "learning_rate": 1.2095247865267155e-05, + "loss": 2.6597, + "step": 7316000 + }, + { + "epoch": 2.2744405603644577, + "grad_norm": 10.62580680847168, + "learning_rate": 1.209265732725904e-05, + "loss": 2.638, + "step": 7316500 + }, + { + "epoch": 2.2745959926449446, + "grad_norm": 10.623239517211914, + "learning_rate": 1.2090066789250926e-05, + "loss": 2.6568, + "step": 7317000 + }, + { + "epoch": 2.2747514249254315, + "grad_norm": 53.089813232421875, + "learning_rate": 1.2087476251242812e-05, + "loss": 2.645, + "step": 7317500 + }, + { + "epoch": 2.2749068572059183, + "grad_norm": 15.537055969238281, + "learning_rate": 1.2084885713234697e-05, + "loss": 2.6882, + "step": 7318000 + }, + { + "epoch": 2.275062289486405, + "grad_norm": 19.091533660888672, + "learning_rate": 1.2082295175226582e-05, + "loss": 2.6542, + "step": 7318500 + }, + { + "epoch": 2.275217721766892, + "grad_norm": 9.909636497497559, + "learning_rate": 1.2079704637218466e-05, + "loss": 2.6517, + "step": 7319000 + }, + { + "epoch": 2.275373154047379, + "grad_norm": 9.869256019592285, + "learning_rate": 1.2077114099210353e-05, + "loss": 2.6305, + "step": 7319500 + }, + { + "epoch": 2.275528586327866, + "grad_norm": 13.273931503295898, + "learning_rate": 1.2074523561202237e-05, + "loss": 2.5905, + "step": 7320000 + }, + { + "epoch": 2.2756840186083527, + "grad_norm": 11.242410659790039, + "learning_rate": 1.2071933023194124e-05, + "loss": 2.6612, + "step": 7320500 + }, + { + "epoch": 2.2758394508888395, + "grad_norm": 31.42948341369629, + "learning_rate": 1.206934248518601e-05, + "loss": 2.6531, + "step": 7321000 + }, + { + "epoch": 2.2759948831693264, + "grad_norm": 10.784209251403809, + "learning_rate": 1.2066751947177894e-05, + "loss": 2.6357, + "step": 7321500 + }, + { + "epoch": 2.2761503154498133, + "grad_norm": 10.878523826599121, + "learning_rate": 1.206416140916978e-05, + "loss": 2.6627, + "step": 7322000 + }, + { + "epoch": 2.2763057477303, + "grad_norm": 12.889009475708008, + "learning_rate": 1.2061570871161664e-05, + "loss": 2.6347, + "step": 7322500 + }, + { + "epoch": 2.276461180010787, + "grad_norm": 8.590786933898926, + "learning_rate": 1.2058980333153552e-05, + "loss": 2.6352, + "step": 7323000 + }, + { + "epoch": 2.276616612291274, + "grad_norm": 10.508933067321777, + "learning_rate": 1.2056389795145435e-05, + "loss": 2.6305, + "step": 7323500 + }, + { + "epoch": 2.2767720445717607, + "grad_norm": 12.304891586303711, + "learning_rate": 1.205379925713732e-05, + "loss": 2.6622, + "step": 7324000 + }, + { + "epoch": 2.2769274768522476, + "grad_norm": 8.002525329589844, + "learning_rate": 1.2051208719129206e-05, + "loss": 2.6334, + "step": 7324500 + }, + { + "epoch": 2.2770829091327345, + "grad_norm": 10.408443450927734, + "learning_rate": 1.2048618181121092e-05, + "loss": 2.6559, + "step": 7325000 + }, + { + "epoch": 2.2772383414132213, + "grad_norm": 9.524382591247559, + "learning_rate": 1.2046027643112979e-05, + "loss": 2.6508, + "step": 7325500 + }, + { + "epoch": 2.277393773693708, + "grad_norm": 10.80701732635498, + "learning_rate": 1.2043437105104863e-05, + "loss": 2.61, + "step": 7326000 + }, + { + "epoch": 2.277549205974195, + "grad_norm": 8.894804000854492, + "learning_rate": 1.2040846567096748e-05, + "loss": 2.6628, + "step": 7326500 + }, + { + "epoch": 2.277704638254682, + "grad_norm": 9.79461669921875, + "learning_rate": 1.2038256029088634e-05, + "loss": 2.6611, + "step": 7327000 + }, + { + "epoch": 2.277860070535169, + "grad_norm": 14.626721382141113, + "learning_rate": 1.2035665491080519e-05, + "loss": 2.606, + "step": 7327500 + }, + { + "epoch": 2.2780155028156557, + "grad_norm": 9.679969787597656, + "learning_rate": 1.2033074953072405e-05, + "loss": 2.6868, + "step": 7328000 + }, + { + "epoch": 2.2781709350961425, + "grad_norm": 8.872432708740234, + "learning_rate": 1.203048441506429e-05, + "loss": 2.6422, + "step": 7328500 + }, + { + "epoch": 2.2783263673766294, + "grad_norm": 17.106014251708984, + "learning_rate": 1.2027893877056175e-05, + "loss": 2.6232, + "step": 7329000 + }, + { + "epoch": 2.2784817996571163, + "grad_norm": 10.413832664489746, + "learning_rate": 1.2025303339048061e-05, + "loss": 2.7312, + "step": 7329500 + }, + { + "epoch": 2.278637231937603, + "grad_norm": 10.852895736694336, + "learning_rate": 1.2022712801039946e-05, + "loss": 2.6363, + "step": 7330000 + }, + { + "epoch": 2.27879266421809, + "grad_norm": 8.364068984985352, + "learning_rate": 1.2020122263031832e-05, + "loss": 2.625, + "step": 7330500 + }, + { + "epoch": 2.278948096498577, + "grad_norm": 9.380669593811035, + "learning_rate": 1.2017531725023717e-05, + "loss": 2.6459, + "step": 7331000 + }, + { + "epoch": 2.2791035287790637, + "grad_norm": 12.007654190063477, + "learning_rate": 1.2014941187015603e-05, + "loss": 2.6414, + "step": 7331500 + }, + { + "epoch": 2.2792589610595506, + "grad_norm": 12.13021183013916, + "learning_rate": 1.2012350649007488e-05, + "loss": 2.6524, + "step": 7332000 + }, + { + "epoch": 2.2794143933400375, + "grad_norm": 9.385856628417969, + "learning_rate": 1.2009760110999372e-05, + "loss": 2.6631, + "step": 7332500 + }, + { + "epoch": 2.2795698256205243, + "grad_norm": 10.469488143920898, + "learning_rate": 1.2007169572991259e-05, + "loss": 2.6913, + "step": 7333000 + }, + { + "epoch": 2.279725257901011, + "grad_norm": 9.293136596679688, + "learning_rate": 1.2004579034983143e-05, + "loss": 2.6072, + "step": 7333500 + }, + { + "epoch": 2.279880690181498, + "grad_norm": 9.023438453674316, + "learning_rate": 1.200198849697503e-05, + "loss": 2.6509, + "step": 7334000 + }, + { + "epoch": 2.2800361224619854, + "grad_norm": 10.7387056350708, + "learning_rate": 1.1999397958966915e-05, + "loss": 2.6639, + "step": 7334500 + }, + { + "epoch": 2.280191554742472, + "grad_norm": 10.561515808105469, + "learning_rate": 1.19968074209588e-05, + "loss": 2.6573, + "step": 7335000 + }, + { + "epoch": 2.280346987022959, + "grad_norm": 10.510430335998535, + "learning_rate": 1.1994216882950686e-05, + "loss": 2.6402, + "step": 7335500 + }, + { + "epoch": 2.2805024193034455, + "grad_norm": 9.133667945861816, + "learning_rate": 1.199162634494257e-05, + "loss": 2.5981, + "step": 7336000 + }, + { + "epoch": 2.280657851583933, + "grad_norm": 14.847354888916016, + "learning_rate": 1.1989035806934457e-05, + "loss": 2.6752, + "step": 7336500 + }, + { + "epoch": 2.2808132838644193, + "grad_norm": 17.029680252075195, + "learning_rate": 1.1986445268926341e-05, + "loss": 2.643, + "step": 7337000 + }, + { + "epoch": 2.2809687161449066, + "grad_norm": 78.86688232421875, + "learning_rate": 1.1983854730918227e-05, + "loss": 2.67, + "step": 7337500 + }, + { + "epoch": 2.2811241484253935, + "grad_norm": 10.898015975952148, + "learning_rate": 1.1981264192910112e-05, + "loss": 2.6289, + "step": 7338000 + }, + { + "epoch": 2.2812795807058803, + "grad_norm": 28.001758575439453, + "learning_rate": 1.1978673654901997e-05, + "loss": 2.6758, + "step": 7338500 + }, + { + "epoch": 2.281435012986367, + "grad_norm": 10.46817684173584, + "learning_rate": 1.1976083116893885e-05, + "loss": 2.6054, + "step": 7339000 + }, + { + "epoch": 2.281590445266854, + "grad_norm": 9.844569206237793, + "learning_rate": 1.1973492578885768e-05, + "loss": 2.5881, + "step": 7339500 + }, + { + "epoch": 2.281745877547341, + "grad_norm": 22.370376586914062, + "learning_rate": 1.1970902040877654e-05, + "loss": 2.6976, + "step": 7340000 + }, + { + "epoch": 2.281901309827828, + "grad_norm": 15.053243637084961, + "learning_rate": 1.196831150286954e-05, + "loss": 2.6081, + "step": 7340500 + }, + { + "epoch": 2.2820567421083147, + "grad_norm": 8.62396240234375, + "learning_rate": 1.1965720964861425e-05, + "loss": 2.6323, + "step": 7341000 + }, + { + "epoch": 2.2822121743888015, + "grad_norm": 18.34342384338379, + "learning_rate": 1.196313042685331e-05, + "loss": 2.6898, + "step": 7341500 + }, + { + "epoch": 2.2823676066692884, + "grad_norm": 11.2401704788208, + "learning_rate": 1.1960539888845196e-05, + "loss": 2.6798, + "step": 7342000 + }, + { + "epoch": 2.2825230389497753, + "grad_norm": 8.266865730285645, + "learning_rate": 1.1957949350837081e-05, + "loss": 2.6401, + "step": 7342500 + }, + { + "epoch": 2.282678471230262, + "grad_norm": 10.361222267150879, + "learning_rate": 1.1955358812828967e-05, + "loss": 2.6639, + "step": 7343000 + }, + { + "epoch": 2.282833903510749, + "grad_norm": 9.902595520019531, + "learning_rate": 1.1952768274820852e-05, + "loss": 2.6262, + "step": 7343500 + }, + { + "epoch": 2.282989335791236, + "grad_norm": 10.80516242980957, + "learning_rate": 1.1950177736812738e-05, + "loss": 2.6734, + "step": 7344000 + }, + { + "epoch": 2.2831447680717227, + "grad_norm": 9.079398155212402, + "learning_rate": 1.1947587198804623e-05, + "loss": 2.6187, + "step": 7344500 + }, + { + "epoch": 2.2833002003522096, + "grad_norm": 13.6677885055542, + "learning_rate": 1.1944996660796508e-05, + "loss": 2.6448, + "step": 7345000 + }, + { + "epoch": 2.2834556326326965, + "grad_norm": 9.185169219970703, + "learning_rate": 1.1942406122788394e-05, + "loss": 2.6321, + "step": 7345500 + }, + { + "epoch": 2.2836110649131833, + "grad_norm": 9.78406047821045, + "learning_rate": 1.1939815584780278e-05, + "loss": 2.6128, + "step": 7346000 + }, + { + "epoch": 2.28376649719367, + "grad_norm": 9.7960786819458, + "learning_rate": 1.1937225046772165e-05, + "loss": 2.6085, + "step": 7346500 + }, + { + "epoch": 2.283921929474157, + "grad_norm": 39.777381896972656, + "learning_rate": 1.1934634508764049e-05, + "loss": 2.6797, + "step": 7347000 + }, + { + "epoch": 2.284077361754644, + "grad_norm": 8.798980712890625, + "learning_rate": 1.1932043970755936e-05, + "loss": 2.6496, + "step": 7347500 + }, + { + "epoch": 2.284232794035131, + "grad_norm": 9.014713287353516, + "learning_rate": 1.1929453432747821e-05, + "loss": 2.6348, + "step": 7348000 + }, + { + "epoch": 2.2843882263156177, + "grad_norm": 9.963129043579102, + "learning_rate": 1.1926862894739705e-05, + "loss": 2.6138, + "step": 7348500 + }, + { + "epoch": 2.2845436585961045, + "grad_norm": 15.82121753692627, + "learning_rate": 1.1924272356731592e-05, + "loss": 2.6318, + "step": 7349000 + }, + { + "epoch": 2.2846990908765914, + "grad_norm": 7.427286624908447, + "learning_rate": 1.1921681818723476e-05, + "loss": 2.5616, + "step": 7349500 + }, + { + "epoch": 2.2848545231570783, + "grad_norm": 8.082317352294922, + "learning_rate": 1.1919091280715363e-05, + "loss": 2.6545, + "step": 7350000 + }, + { + "epoch": 2.285009955437565, + "grad_norm": 17.013952255249023, + "learning_rate": 1.1916500742707247e-05, + "loss": 2.6034, + "step": 7350500 + }, + { + "epoch": 2.285165387718052, + "grad_norm": 9.400884628295898, + "learning_rate": 1.1913910204699132e-05, + "loss": 2.6428, + "step": 7351000 + }, + { + "epoch": 2.285320819998539, + "grad_norm": 14.106953620910645, + "learning_rate": 1.1911319666691018e-05, + "loss": 2.6546, + "step": 7351500 + }, + { + "epoch": 2.2854762522790257, + "grad_norm": 10.106707572937012, + "learning_rate": 1.1908729128682903e-05, + "loss": 2.6478, + "step": 7352000 + }, + { + "epoch": 2.2856316845595126, + "grad_norm": 11.040175437927246, + "learning_rate": 1.190613859067479e-05, + "loss": 2.6436, + "step": 7352500 + }, + { + "epoch": 2.2857871168399995, + "grad_norm": 8.643797874450684, + "learning_rate": 1.1903548052666674e-05, + "loss": 2.6052, + "step": 7353000 + }, + { + "epoch": 2.2859425491204863, + "grad_norm": 8.832962036132812, + "learning_rate": 1.190095751465856e-05, + "loss": 2.6221, + "step": 7353500 + }, + { + "epoch": 2.286097981400973, + "grad_norm": 23.48164939880371, + "learning_rate": 1.1898366976650445e-05, + "loss": 2.6504, + "step": 7354000 + }, + { + "epoch": 2.28625341368146, + "grad_norm": 9.990300178527832, + "learning_rate": 1.189577643864233e-05, + "loss": 2.6973, + "step": 7354500 + }, + { + "epoch": 2.286408845961947, + "grad_norm": 10.628157615661621, + "learning_rate": 1.1893185900634216e-05, + "loss": 2.6893, + "step": 7355000 + }, + { + "epoch": 2.286564278242434, + "grad_norm": 11.612908363342285, + "learning_rate": 1.1890595362626101e-05, + "loss": 2.6546, + "step": 7355500 + }, + { + "epoch": 2.2867197105229207, + "grad_norm": 9.228711128234863, + "learning_rate": 1.1888004824617987e-05, + "loss": 2.6605, + "step": 7356000 + }, + { + "epoch": 2.2868751428034075, + "grad_norm": 10.910202026367188, + "learning_rate": 1.1885414286609872e-05, + "loss": 2.6525, + "step": 7356500 + }, + { + "epoch": 2.2870305750838944, + "grad_norm": 18.09401512145996, + "learning_rate": 1.1882823748601758e-05, + "loss": 2.5947, + "step": 7357000 + }, + { + "epoch": 2.2871860073643813, + "grad_norm": 9.234265327453613, + "learning_rate": 1.1880233210593643e-05, + "loss": 2.6775, + "step": 7357500 + }, + { + "epoch": 2.287341439644868, + "grad_norm": 10.83712387084961, + "learning_rate": 1.1877642672585529e-05, + "loss": 2.5779, + "step": 7358000 + }, + { + "epoch": 2.287496871925355, + "grad_norm": 9.771502494812012, + "learning_rate": 1.1875052134577414e-05, + "loss": 2.6282, + "step": 7358500 + }, + { + "epoch": 2.2876523042058423, + "grad_norm": 9.32650375366211, + "learning_rate": 1.18724615965693e-05, + "loss": 2.6969, + "step": 7359000 + }, + { + "epoch": 2.2878077364863287, + "grad_norm": 11.643510818481445, + "learning_rate": 1.1869871058561183e-05, + "loss": 2.6532, + "step": 7359500 + }, + { + "epoch": 2.287963168766816, + "grad_norm": 9.58353042602539, + "learning_rate": 1.186728052055307e-05, + "loss": 2.6393, + "step": 7360000 + }, + { + "epoch": 2.2881186010473025, + "grad_norm": 9.882546424865723, + "learning_rate": 1.1864689982544954e-05, + "loss": 2.6222, + "step": 7360500 + }, + { + "epoch": 2.28827403332779, + "grad_norm": 7.5372538566589355, + "learning_rate": 1.1862099444536841e-05, + "loss": 2.6304, + "step": 7361000 + }, + { + "epoch": 2.2884294656082766, + "grad_norm": 8.517253875732422, + "learning_rate": 1.1859508906528727e-05, + "loss": 2.6022, + "step": 7361500 + }, + { + "epoch": 2.2885848978887635, + "grad_norm": 13.544057846069336, + "learning_rate": 1.185691836852061e-05, + "loss": 2.6274, + "step": 7362000 + }, + { + "epoch": 2.2887403301692504, + "grad_norm": 9.599526405334473, + "learning_rate": 1.1854327830512498e-05, + "loss": 2.6356, + "step": 7362500 + }, + { + "epoch": 2.2888957624497372, + "grad_norm": 8.459321975708008, + "learning_rate": 1.1851737292504382e-05, + "loss": 2.6406, + "step": 7363000 + }, + { + "epoch": 2.289051194730224, + "grad_norm": 9.713486671447754, + "learning_rate": 1.1849146754496269e-05, + "loss": 2.6064, + "step": 7363500 + }, + { + "epoch": 2.289206627010711, + "grad_norm": 8.033989906311035, + "learning_rate": 1.1846556216488152e-05, + "loss": 2.6018, + "step": 7364000 + }, + { + "epoch": 2.289362059291198, + "grad_norm": 18.663394927978516, + "learning_rate": 1.1843965678480038e-05, + "loss": 2.6672, + "step": 7364500 + }, + { + "epoch": 2.2895174915716847, + "grad_norm": 9.408242225646973, + "learning_rate": 1.1841375140471923e-05, + "loss": 2.6841, + "step": 7365000 + }, + { + "epoch": 2.2896729238521716, + "grad_norm": 8.333396911621094, + "learning_rate": 1.1838784602463809e-05, + "loss": 2.6224, + "step": 7365500 + }, + { + "epoch": 2.2898283561326584, + "grad_norm": 6.264537334442139, + "learning_rate": 1.1836194064455696e-05, + "loss": 2.6363, + "step": 7366000 + }, + { + "epoch": 2.2899837884131453, + "grad_norm": 10.040497779846191, + "learning_rate": 1.183360352644758e-05, + "loss": 2.6302, + "step": 7366500 + }, + { + "epoch": 2.290139220693632, + "grad_norm": 14.074384689331055, + "learning_rate": 1.1831012988439465e-05, + "loss": 2.5816, + "step": 7367000 + }, + { + "epoch": 2.290294652974119, + "grad_norm": 8.930139541625977, + "learning_rate": 1.182842245043135e-05, + "loss": 2.6267, + "step": 7367500 + }, + { + "epoch": 2.290450085254606, + "grad_norm": 9.924764633178711, + "learning_rate": 1.1825831912423236e-05, + "loss": 2.6234, + "step": 7368000 + }, + { + "epoch": 2.290605517535093, + "grad_norm": 9.90870475769043, + "learning_rate": 1.1823241374415122e-05, + "loss": 2.6531, + "step": 7368500 + }, + { + "epoch": 2.2907609498155797, + "grad_norm": 38.056976318359375, + "learning_rate": 1.1820650836407007e-05, + "loss": 2.6688, + "step": 7369000 + }, + { + "epoch": 2.2909163820960665, + "grad_norm": 9.45627498626709, + "learning_rate": 1.1818060298398893e-05, + "loss": 2.5985, + "step": 7369500 + }, + { + "epoch": 2.2910718143765534, + "grad_norm": 9.272185325622559, + "learning_rate": 1.1815469760390778e-05, + "loss": 2.6347, + "step": 7370000 + }, + { + "epoch": 2.2912272466570403, + "grad_norm": 10.472084045410156, + "learning_rate": 1.1812879222382663e-05, + "loss": 2.613, + "step": 7370500 + }, + { + "epoch": 2.291382678937527, + "grad_norm": 7.898168087005615, + "learning_rate": 1.1810288684374549e-05, + "loss": 2.6256, + "step": 7371000 + }, + { + "epoch": 2.291538111218014, + "grad_norm": 9.883302688598633, + "learning_rate": 1.1807698146366434e-05, + "loss": 2.6131, + "step": 7371500 + }, + { + "epoch": 2.291693543498501, + "grad_norm": 15.978287696838379, + "learning_rate": 1.180510760835832e-05, + "loss": 2.6373, + "step": 7372000 + }, + { + "epoch": 2.2918489757789877, + "grad_norm": 14.343766212463379, + "learning_rate": 1.1802517070350205e-05, + "loss": 2.6355, + "step": 7372500 + }, + { + "epoch": 2.2920044080594746, + "grad_norm": 9.453519821166992, + "learning_rate": 1.1799926532342089e-05, + "loss": 2.5924, + "step": 7373000 + }, + { + "epoch": 2.2921598403399615, + "grad_norm": 7.956838607788086, + "learning_rate": 1.1797335994333976e-05, + "loss": 2.6481, + "step": 7373500 + }, + { + "epoch": 2.2923152726204483, + "grad_norm": 11.194558143615723, + "learning_rate": 1.1794745456325862e-05, + "loss": 2.6255, + "step": 7374000 + }, + { + "epoch": 2.292470704900935, + "grad_norm": 8.869857788085938, + "learning_rate": 1.1792154918317747e-05, + "loss": 2.6669, + "step": 7374500 + }, + { + "epoch": 2.292626137181422, + "grad_norm": 10.015215873718262, + "learning_rate": 1.1789564380309633e-05, + "loss": 2.6252, + "step": 7375000 + }, + { + "epoch": 2.292781569461909, + "grad_norm": 9.815710067749023, + "learning_rate": 1.1786973842301516e-05, + "loss": 2.6325, + "step": 7375500 + }, + { + "epoch": 2.292937001742396, + "grad_norm": 8.389053344726562, + "learning_rate": 1.1784383304293404e-05, + "loss": 2.6694, + "step": 7376000 + }, + { + "epoch": 2.2930924340228827, + "grad_norm": 9.91069221496582, + "learning_rate": 1.1781792766285287e-05, + "loss": 2.681, + "step": 7376500 + }, + { + "epoch": 2.2932478663033695, + "grad_norm": 9.795629501342773, + "learning_rate": 1.1779202228277174e-05, + "loss": 2.6404, + "step": 7377000 + }, + { + "epoch": 2.2934032985838564, + "grad_norm": 8.886302947998047, + "learning_rate": 1.1776611690269058e-05, + "loss": 2.6599, + "step": 7377500 + }, + { + "epoch": 2.2935587308643433, + "grad_norm": 7.904273986816406, + "learning_rate": 1.1774021152260944e-05, + "loss": 2.633, + "step": 7378000 + }, + { + "epoch": 2.29371416314483, + "grad_norm": 10.323083877563477, + "learning_rate": 1.177143061425283e-05, + "loss": 2.6314, + "step": 7378500 + }, + { + "epoch": 2.293869595425317, + "grad_norm": 57.3663215637207, + "learning_rate": 1.1768840076244715e-05, + "loss": 2.6341, + "step": 7379000 + }, + { + "epoch": 2.294025027705804, + "grad_norm": 11.453239440917969, + "learning_rate": 1.1766249538236602e-05, + "loss": 2.604, + "step": 7379500 + }, + { + "epoch": 2.2941804599862907, + "grad_norm": 9.662607192993164, + "learning_rate": 1.1763659000228485e-05, + "loss": 2.6249, + "step": 7380000 + }, + { + "epoch": 2.2943358922667776, + "grad_norm": 9.82848072052002, + "learning_rate": 1.1761068462220371e-05, + "loss": 2.6972, + "step": 7380500 + }, + { + "epoch": 2.2944913245472645, + "grad_norm": 11.72391414642334, + "learning_rate": 1.1758477924212256e-05, + "loss": 2.6192, + "step": 7381000 + }, + { + "epoch": 2.2946467568277513, + "grad_norm": 11.209382057189941, + "learning_rate": 1.1755887386204142e-05, + "loss": 2.6074, + "step": 7381500 + }, + { + "epoch": 2.294802189108238, + "grad_norm": 14.482561111450195, + "learning_rate": 1.1753296848196027e-05, + "loss": 2.6588, + "step": 7382000 + }, + { + "epoch": 2.2949576213887255, + "grad_norm": 8.399670600891113, + "learning_rate": 1.1750706310187913e-05, + "loss": 2.6317, + "step": 7382500 + }, + { + "epoch": 2.295113053669212, + "grad_norm": 10.764612197875977, + "learning_rate": 1.1748115772179798e-05, + "loss": 2.6726, + "step": 7383000 + }, + { + "epoch": 2.2952684859496992, + "grad_norm": 12.036933898925781, + "learning_rate": 1.1745525234171684e-05, + "loss": 2.6421, + "step": 7383500 + }, + { + "epoch": 2.2954239182301857, + "grad_norm": 8.614667892456055, + "learning_rate": 1.174293469616357e-05, + "loss": 2.603, + "step": 7384000 + }, + { + "epoch": 2.295579350510673, + "grad_norm": 11.93481159210205, + "learning_rate": 1.1740344158155455e-05, + "loss": 2.669, + "step": 7384500 + }, + { + "epoch": 2.2957347827911594, + "grad_norm": 7.918427467346191, + "learning_rate": 1.173775362014734e-05, + "loss": 2.6503, + "step": 7385000 + }, + { + "epoch": 2.2958902150716467, + "grad_norm": 9.95085334777832, + "learning_rate": 1.1735163082139226e-05, + "loss": 2.6471, + "step": 7385500 + }, + { + "epoch": 2.2960456473521336, + "grad_norm": 46.56087112426758, + "learning_rate": 1.1732572544131111e-05, + "loss": 2.6195, + "step": 7386000 + }, + { + "epoch": 2.2962010796326204, + "grad_norm": 15.341014862060547, + "learning_rate": 1.1729982006122996e-05, + "loss": 2.6468, + "step": 7386500 + }, + { + "epoch": 2.2963565119131073, + "grad_norm": 8.97715950012207, + "learning_rate": 1.1727391468114882e-05, + "loss": 2.6558, + "step": 7387000 + }, + { + "epoch": 2.296511944193594, + "grad_norm": 11.864767074584961, + "learning_rate": 1.1724800930106767e-05, + "loss": 2.6164, + "step": 7387500 + }, + { + "epoch": 2.296667376474081, + "grad_norm": 7.02922248840332, + "learning_rate": 1.1722210392098653e-05, + "loss": 2.6293, + "step": 7388000 + }, + { + "epoch": 2.296822808754568, + "grad_norm": 8.59057331085205, + "learning_rate": 1.1719619854090538e-05, + "loss": 2.6143, + "step": 7388500 + }, + { + "epoch": 2.2969782410350548, + "grad_norm": 9.40333080291748, + "learning_rate": 1.1717029316082424e-05, + "loss": 2.6841, + "step": 7389000 + }, + { + "epoch": 2.2971336733155416, + "grad_norm": 10.107288360595703, + "learning_rate": 1.171443877807431e-05, + "loss": 2.6385, + "step": 7389500 + }, + { + "epoch": 2.2972891055960285, + "grad_norm": 9.989570617675781, + "learning_rate": 1.1711848240066193e-05, + "loss": 2.6146, + "step": 7390000 + }, + { + "epoch": 2.2974445378765154, + "grad_norm": 14.210516929626465, + "learning_rate": 1.170925770205808e-05, + "loss": 2.6168, + "step": 7390500 + }, + { + "epoch": 2.2975999701570022, + "grad_norm": 8.992165565490723, + "learning_rate": 1.1706667164049964e-05, + "loss": 2.6364, + "step": 7391000 + }, + { + "epoch": 2.297755402437489, + "grad_norm": 8.616151809692383, + "learning_rate": 1.1704076626041851e-05, + "loss": 2.6233, + "step": 7391500 + }, + { + "epoch": 2.297910834717976, + "grad_norm": 9.122289657592773, + "learning_rate": 1.1701486088033737e-05, + "loss": 2.6336, + "step": 7392000 + }, + { + "epoch": 2.298066266998463, + "grad_norm": 8.366823196411133, + "learning_rate": 1.169889555002562e-05, + "loss": 2.6045, + "step": 7392500 + }, + { + "epoch": 2.2982216992789497, + "grad_norm": 10.597240447998047, + "learning_rate": 1.1696305012017507e-05, + "loss": 2.6259, + "step": 7393000 + }, + { + "epoch": 2.2983771315594366, + "grad_norm": 9.460378646850586, + "learning_rate": 1.1693714474009391e-05, + "loss": 2.6297, + "step": 7393500 + }, + { + "epoch": 2.2985325638399234, + "grad_norm": 12.240315437316895, + "learning_rate": 1.1691123936001278e-05, + "loss": 2.6326, + "step": 7394000 + }, + { + "epoch": 2.2986879961204103, + "grad_norm": 11.929838180541992, + "learning_rate": 1.1688533397993162e-05, + "loss": 2.6456, + "step": 7394500 + }, + { + "epoch": 2.298843428400897, + "grad_norm": 10.309775352478027, + "learning_rate": 1.1685942859985048e-05, + "loss": 2.6372, + "step": 7395000 + }, + { + "epoch": 2.298998860681384, + "grad_norm": 6.950466632843018, + "learning_rate": 1.1683352321976933e-05, + "loss": 2.6108, + "step": 7395500 + }, + { + "epoch": 2.299154292961871, + "grad_norm": 8.894598007202148, + "learning_rate": 1.1680761783968819e-05, + "loss": 2.6012, + "step": 7396000 + }, + { + "epoch": 2.2993097252423578, + "grad_norm": 18.088970184326172, + "learning_rate": 1.1678171245960706e-05, + "loss": 2.6256, + "step": 7396500 + }, + { + "epoch": 2.2994651575228446, + "grad_norm": 8.75010871887207, + "learning_rate": 1.167558070795259e-05, + "loss": 2.6747, + "step": 7397000 + }, + { + "epoch": 2.2996205898033315, + "grad_norm": 10.019926071166992, + "learning_rate": 1.1672990169944475e-05, + "loss": 2.6174, + "step": 7397500 + }, + { + "epoch": 2.2997760220838184, + "grad_norm": 8.91125202178955, + "learning_rate": 1.167039963193636e-05, + "loss": 2.668, + "step": 7398000 + }, + { + "epoch": 2.2999314543643052, + "grad_norm": 9.99581527709961, + "learning_rate": 1.1667809093928246e-05, + "loss": 2.6131, + "step": 7398500 + }, + { + "epoch": 2.300086886644792, + "grad_norm": 11.691648483276367, + "learning_rate": 1.1665218555920131e-05, + "loss": 2.5944, + "step": 7399000 + }, + { + "epoch": 2.300242318925279, + "grad_norm": 10.35532283782959, + "learning_rate": 1.1662628017912017e-05, + "loss": 2.6414, + "step": 7399500 + }, + { + "epoch": 2.300397751205766, + "grad_norm": 46.20178985595703, + "learning_rate": 1.1660037479903902e-05, + "loss": 2.6197, + "step": 7400000 + }, + { + "epoch": 2.3005531834862527, + "grad_norm": 42.288841247558594, + "learning_rate": 1.1657446941895788e-05, + "loss": 2.6232, + "step": 7400500 + }, + { + "epoch": 2.3007086157667396, + "grad_norm": 8.870542526245117, + "learning_rate": 1.1654856403887673e-05, + "loss": 2.6443, + "step": 7401000 + }, + { + "epoch": 2.3008640480472264, + "grad_norm": 8.563353538513184, + "learning_rate": 1.1652265865879559e-05, + "loss": 2.6224, + "step": 7401500 + }, + { + "epoch": 2.3010194803277133, + "grad_norm": 11.569518089294434, + "learning_rate": 1.1649675327871444e-05, + "loss": 2.6513, + "step": 7402000 + }, + { + "epoch": 2.3011749126082, + "grad_norm": 8.892836570739746, + "learning_rate": 1.164708478986333e-05, + "loss": 2.6093, + "step": 7402500 + }, + { + "epoch": 2.301330344888687, + "grad_norm": 7.4568281173706055, + "learning_rate": 1.1644494251855215e-05, + "loss": 2.6568, + "step": 7403000 + }, + { + "epoch": 2.301485777169174, + "grad_norm": 8.596475601196289, + "learning_rate": 1.1641903713847099e-05, + "loss": 2.5962, + "step": 7403500 + }, + { + "epoch": 2.3016412094496608, + "grad_norm": 10.636489868164062, + "learning_rate": 1.1639313175838986e-05, + "loss": 2.6495, + "step": 7404000 + }, + { + "epoch": 2.3017966417301476, + "grad_norm": 11.385099411010742, + "learning_rate": 1.163672263783087e-05, + "loss": 2.638, + "step": 7404500 + }, + { + "epoch": 2.3019520740106345, + "grad_norm": 11.482050895690918, + "learning_rate": 1.1634132099822757e-05, + "loss": 2.5782, + "step": 7405000 + }, + { + "epoch": 2.3021075062911214, + "grad_norm": 10.950020790100098, + "learning_rate": 1.1631541561814642e-05, + "loss": 2.666, + "step": 7405500 + }, + { + "epoch": 2.3022629385716082, + "grad_norm": 9.817342758178711, + "learning_rate": 1.1628951023806526e-05, + "loss": 2.6194, + "step": 7406000 + }, + { + "epoch": 2.302418370852095, + "grad_norm": 9.496191024780273, + "learning_rate": 1.1626360485798413e-05, + "loss": 2.6726, + "step": 7406500 + }, + { + "epoch": 2.3025738031325824, + "grad_norm": 15.770816802978516, + "learning_rate": 1.1623769947790297e-05, + "loss": 2.6365, + "step": 7407000 + }, + { + "epoch": 2.302729235413069, + "grad_norm": 10.028091430664062, + "learning_rate": 1.1621179409782184e-05, + "loss": 2.6306, + "step": 7407500 + }, + { + "epoch": 2.302884667693556, + "grad_norm": 8.614579200744629, + "learning_rate": 1.1618588871774068e-05, + "loss": 2.6612, + "step": 7408000 + }, + { + "epoch": 2.3030400999740426, + "grad_norm": 11.648008346557617, + "learning_rate": 1.1615998333765953e-05, + "loss": 2.5726, + "step": 7408500 + }, + { + "epoch": 2.30319553225453, + "grad_norm": 11.516250610351562, + "learning_rate": 1.1613407795757839e-05, + "loss": 2.656, + "step": 7409000 + }, + { + "epoch": 2.3033509645350168, + "grad_norm": 13.90493106842041, + "learning_rate": 1.1610817257749724e-05, + "loss": 2.6493, + "step": 7409500 + }, + { + "epoch": 2.3035063968155036, + "grad_norm": 10.165961265563965, + "learning_rate": 1.1608226719741611e-05, + "loss": 2.6171, + "step": 7410000 + }, + { + "epoch": 2.3036618290959905, + "grad_norm": 9.992165565490723, + "learning_rate": 1.1605636181733495e-05, + "loss": 2.6525, + "step": 7410500 + }, + { + "epoch": 2.3038172613764774, + "grad_norm": 22.210325241088867, + "learning_rate": 1.160304564372538e-05, + "loss": 2.6427, + "step": 7411000 + }, + { + "epoch": 2.303972693656964, + "grad_norm": 10.602191925048828, + "learning_rate": 1.1600455105717266e-05, + "loss": 2.6267, + "step": 7411500 + }, + { + "epoch": 2.304128125937451, + "grad_norm": 10.506535530090332, + "learning_rate": 1.1597864567709152e-05, + "loss": 2.6588, + "step": 7412000 + }, + { + "epoch": 2.304283558217938, + "grad_norm": 8.109901428222656, + "learning_rate": 1.1595274029701037e-05, + "loss": 2.6287, + "step": 7412500 + }, + { + "epoch": 2.304438990498425, + "grad_norm": 10.383133888244629, + "learning_rate": 1.1592683491692922e-05, + "loss": 2.6198, + "step": 7413000 + }, + { + "epoch": 2.3045944227789117, + "grad_norm": 9.42286205291748, + "learning_rate": 1.1590092953684808e-05, + "loss": 2.6438, + "step": 7413500 + }, + { + "epoch": 2.3047498550593986, + "grad_norm": 9.18208122253418, + "learning_rate": 1.1587502415676693e-05, + "loss": 2.7108, + "step": 7414000 + }, + { + "epoch": 2.3049052873398854, + "grad_norm": 7.650609493255615, + "learning_rate": 1.1584911877668579e-05, + "loss": 2.6857, + "step": 7414500 + }, + { + "epoch": 2.3050607196203723, + "grad_norm": 9.502556800842285, + "learning_rate": 1.1582321339660464e-05, + "loss": 2.7294, + "step": 7415000 + }, + { + "epoch": 2.305216151900859, + "grad_norm": 30.28929901123047, + "learning_rate": 1.157973080165235e-05, + "loss": 2.6509, + "step": 7415500 + }, + { + "epoch": 2.305371584181346, + "grad_norm": 9.455927848815918, + "learning_rate": 1.1577140263644235e-05, + "loss": 2.6592, + "step": 7416000 + }, + { + "epoch": 2.305527016461833, + "grad_norm": 9.640174865722656, + "learning_rate": 1.157454972563612e-05, + "loss": 2.6823, + "step": 7416500 + }, + { + "epoch": 2.3056824487423198, + "grad_norm": 9.656966209411621, + "learning_rate": 1.1571959187628004e-05, + "loss": 2.6498, + "step": 7417000 + }, + { + "epoch": 2.3058378810228066, + "grad_norm": 9.052373886108398, + "learning_rate": 1.1569368649619892e-05, + "loss": 2.6315, + "step": 7417500 + }, + { + "epoch": 2.3059933133032935, + "grad_norm": 10.627616882324219, + "learning_rate": 1.1566778111611775e-05, + "loss": 2.5957, + "step": 7418000 + }, + { + "epoch": 2.3061487455837804, + "grad_norm": 10.331043243408203, + "learning_rate": 1.1564187573603662e-05, + "loss": 2.7052, + "step": 7418500 + }, + { + "epoch": 2.3063041778642672, + "grad_norm": 22.443973541259766, + "learning_rate": 1.1561597035595548e-05, + "loss": 2.6223, + "step": 7419000 + }, + { + "epoch": 2.306459610144754, + "grad_norm": 10.020153999328613, + "learning_rate": 1.1559006497587432e-05, + "loss": 2.6065, + "step": 7419500 + }, + { + "epoch": 2.306615042425241, + "grad_norm": 9.296684265136719, + "learning_rate": 1.1556415959579319e-05, + "loss": 2.6065, + "step": 7420000 + }, + { + "epoch": 2.306770474705728, + "grad_norm": 10.162707328796387, + "learning_rate": 1.1553825421571203e-05, + "loss": 2.6723, + "step": 7420500 + }, + { + "epoch": 2.3069259069862147, + "grad_norm": 11.115857124328613, + "learning_rate": 1.155123488356309e-05, + "loss": 2.6443, + "step": 7421000 + }, + { + "epoch": 2.3070813392667016, + "grad_norm": 13.589570999145508, + "learning_rate": 1.1548644345554974e-05, + "loss": 2.6566, + "step": 7421500 + }, + { + "epoch": 2.3072367715471884, + "grad_norm": 12.503231048583984, + "learning_rate": 1.1546053807546859e-05, + "loss": 2.6504, + "step": 7422000 + }, + { + "epoch": 2.3073922038276753, + "grad_norm": 12.415935516357422, + "learning_rate": 1.1543463269538744e-05, + "loss": 2.6401, + "step": 7422500 + }, + { + "epoch": 2.307547636108162, + "grad_norm": 10.601112365722656, + "learning_rate": 1.154087273153063e-05, + "loss": 2.6397, + "step": 7423000 + }, + { + "epoch": 2.307703068388649, + "grad_norm": 8.3087739944458, + "learning_rate": 1.1538282193522517e-05, + "loss": 2.644, + "step": 7423500 + }, + { + "epoch": 2.307858500669136, + "grad_norm": 7.41399621963501, + "learning_rate": 1.15356916555144e-05, + "loss": 2.6535, + "step": 7424000 + }, + { + "epoch": 2.3080139329496228, + "grad_norm": 10.653406143188477, + "learning_rate": 1.1533101117506286e-05, + "loss": 2.6079, + "step": 7424500 + }, + { + "epoch": 2.3081693652301096, + "grad_norm": 13.288948059082031, + "learning_rate": 1.1530510579498172e-05, + "loss": 2.6349, + "step": 7425000 + }, + { + "epoch": 2.3083247975105965, + "grad_norm": 8.548242568969727, + "learning_rate": 1.1527920041490057e-05, + "loss": 2.6181, + "step": 7425500 + }, + { + "epoch": 2.3084802297910834, + "grad_norm": 8.982117652893066, + "learning_rate": 1.1525329503481943e-05, + "loss": 2.6245, + "step": 7426000 + }, + { + "epoch": 2.3086356620715702, + "grad_norm": 13.444840431213379, + "learning_rate": 1.1522738965473828e-05, + "loss": 2.6496, + "step": 7426500 + }, + { + "epoch": 2.308791094352057, + "grad_norm": 9.278909683227539, + "learning_rate": 1.1520148427465714e-05, + "loss": 2.6339, + "step": 7427000 + }, + { + "epoch": 2.308946526632544, + "grad_norm": 10.730112075805664, + "learning_rate": 1.1517557889457599e-05, + "loss": 2.6384, + "step": 7427500 + }, + { + "epoch": 2.309101958913031, + "grad_norm": 40.16972732543945, + "learning_rate": 1.1514967351449485e-05, + "loss": 2.6616, + "step": 7428000 + }, + { + "epoch": 2.3092573911935177, + "grad_norm": 10.393341064453125, + "learning_rate": 1.151237681344137e-05, + "loss": 2.6289, + "step": 7428500 + }, + { + "epoch": 2.3094128234740046, + "grad_norm": 11.521309852600098, + "learning_rate": 1.1509786275433255e-05, + "loss": 2.6319, + "step": 7429000 + }, + { + "epoch": 2.3095682557544914, + "grad_norm": 11.416764259338379, + "learning_rate": 1.1507195737425141e-05, + "loss": 2.6122, + "step": 7429500 + }, + { + "epoch": 2.3097236880349783, + "grad_norm": 24.8316650390625, + "learning_rate": 1.1504605199417026e-05, + "loss": 2.615, + "step": 7430000 + }, + { + "epoch": 2.3098791203154656, + "grad_norm": 10.860010147094727, + "learning_rate": 1.150201466140891e-05, + "loss": 2.6275, + "step": 7430500 + }, + { + "epoch": 2.310034552595952, + "grad_norm": 12.6547269821167, + "learning_rate": 1.1499424123400797e-05, + "loss": 2.6897, + "step": 7431000 + }, + { + "epoch": 2.3101899848764393, + "grad_norm": 9.366609573364258, + "learning_rate": 1.1496833585392681e-05, + "loss": 2.6228, + "step": 7431500 + }, + { + "epoch": 2.3103454171569258, + "grad_norm": 9.87781810760498, + "learning_rate": 1.1494243047384568e-05, + "loss": 2.6128, + "step": 7432000 + }, + { + "epoch": 2.310500849437413, + "grad_norm": 20.177690505981445, + "learning_rate": 1.1491652509376454e-05, + "loss": 2.6188, + "step": 7432500 + }, + { + "epoch": 2.3106562817178995, + "grad_norm": 9.212550163269043, + "learning_rate": 1.1489061971368337e-05, + "loss": 2.6476, + "step": 7433000 + }, + { + "epoch": 2.310811713998387, + "grad_norm": 24.886932373046875, + "learning_rate": 1.1486471433360225e-05, + "loss": 2.6664, + "step": 7433500 + }, + { + "epoch": 2.3109671462788737, + "grad_norm": 19.94777488708496, + "learning_rate": 1.1483880895352108e-05, + "loss": 2.6271, + "step": 7434000 + }, + { + "epoch": 2.3111225785593605, + "grad_norm": 9.848361015319824, + "learning_rate": 1.1481290357343995e-05, + "loss": 2.6203, + "step": 7434500 + }, + { + "epoch": 2.3112780108398474, + "grad_norm": 9.015551567077637, + "learning_rate": 1.147869981933588e-05, + "loss": 2.5999, + "step": 7435000 + }, + { + "epoch": 2.3114334431203343, + "grad_norm": 8.167235374450684, + "learning_rate": 1.1476109281327765e-05, + "loss": 2.6168, + "step": 7435500 + }, + { + "epoch": 2.311588875400821, + "grad_norm": 8.901158332824707, + "learning_rate": 1.147351874331965e-05, + "loss": 2.6634, + "step": 7436000 + }, + { + "epoch": 2.311744307681308, + "grad_norm": 13.867891311645508, + "learning_rate": 1.1470928205311536e-05, + "loss": 2.6663, + "step": 7436500 + }, + { + "epoch": 2.311899739961795, + "grad_norm": 9.149711608886719, + "learning_rate": 1.1468337667303423e-05, + "loss": 2.6533, + "step": 7437000 + }, + { + "epoch": 2.3120551722422817, + "grad_norm": 15.846681594848633, + "learning_rate": 1.1465747129295307e-05, + "loss": 2.6211, + "step": 7437500 + }, + { + "epoch": 2.3122106045227686, + "grad_norm": 11.35682201385498, + "learning_rate": 1.1463156591287192e-05, + "loss": 2.6537, + "step": 7438000 + }, + { + "epoch": 2.3123660368032555, + "grad_norm": 12.655189514160156, + "learning_rate": 1.1460566053279077e-05, + "loss": 2.6733, + "step": 7438500 + }, + { + "epoch": 2.3125214690837423, + "grad_norm": 10.226794242858887, + "learning_rate": 1.1457975515270963e-05, + "loss": 2.6374, + "step": 7439000 + }, + { + "epoch": 2.312676901364229, + "grad_norm": 9.849506378173828, + "learning_rate": 1.1455384977262848e-05, + "loss": 2.6362, + "step": 7439500 + }, + { + "epoch": 2.312832333644716, + "grad_norm": 9.253920555114746, + "learning_rate": 1.1452794439254734e-05, + "loss": 2.5995, + "step": 7440000 + }, + { + "epoch": 2.312987765925203, + "grad_norm": 15.581098556518555, + "learning_rate": 1.145020390124662e-05, + "loss": 2.6359, + "step": 7440500 + }, + { + "epoch": 2.31314319820569, + "grad_norm": 6.364349365234375, + "learning_rate": 1.1447613363238505e-05, + "loss": 2.6477, + "step": 7441000 + }, + { + "epoch": 2.3132986304861767, + "grad_norm": 9.745960235595703, + "learning_rate": 1.144502282523039e-05, + "loss": 2.608, + "step": 7441500 + }, + { + "epoch": 2.3134540627666635, + "grad_norm": 23.941818237304688, + "learning_rate": 1.1442432287222276e-05, + "loss": 2.6558, + "step": 7442000 + }, + { + "epoch": 2.3136094950471504, + "grad_norm": 25.5565185546875, + "learning_rate": 1.1439841749214161e-05, + "loss": 2.6349, + "step": 7442500 + }, + { + "epoch": 2.3137649273276373, + "grad_norm": 10.166278839111328, + "learning_rate": 1.1437251211206047e-05, + "loss": 2.6105, + "step": 7443000 + }, + { + "epoch": 2.313920359608124, + "grad_norm": 9.327764511108398, + "learning_rate": 1.1434660673197932e-05, + "loss": 2.625, + "step": 7443500 + }, + { + "epoch": 2.314075791888611, + "grad_norm": 9.640209197998047, + "learning_rate": 1.1432070135189816e-05, + "loss": 2.6345, + "step": 7444000 + }, + { + "epoch": 2.314231224169098, + "grad_norm": 9.534721374511719, + "learning_rate": 1.1429479597181703e-05, + "loss": 2.6713, + "step": 7444500 + }, + { + "epoch": 2.3143866564495847, + "grad_norm": 8.475646018981934, + "learning_rate": 1.1426889059173587e-05, + "loss": 2.6588, + "step": 7445000 + }, + { + "epoch": 2.3145420887300716, + "grad_norm": 9.210695266723633, + "learning_rate": 1.1424298521165474e-05, + "loss": 2.6454, + "step": 7445500 + }, + { + "epoch": 2.3146975210105585, + "grad_norm": 10.395577430725098, + "learning_rate": 1.142170798315736e-05, + "loss": 2.5973, + "step": 7446000 + }, + { + "epoch": 2.3148529532910453, + "grad_norm": 11.990683555603027, + "learning_rate": 1.1419117445149243e-05, + "loss": 2.617, + "step": 7446500 + }, + { + "epoch": 2.315008385571532, + "grad_norm": 10.311555862426758, + "learning_rate": 1.141652690714113e-05, + "loss": 2.684, + "step": 7447000 + }, + { + "epoch": 2.315163817852019, + "grad_norm": 12.143085479736328, + "learning_rate": 1.1413936369133014e-05, + "loss": 2.6236, + "step": 7447500 + }, + { + "epoch": 2.315319250132506, + "grad_norm": 8.85138988494873, + "learning_rate": 1.1411345831124901e-05, + "loss": 2.6723, + "step": 7448000 + }, + { + "epoch": 2.315474682412993, + "grad_norm": 8.6909761428833, + "learning_rate": 1.1408755293116785e-05, + "loss": 2.6242, + "step": 7448500 + }, + { + "epoch": 2.3156301146934797, + "grad_norm": 9.192151069641113, + "learning_rate": 1.140616475510867e-05, + "loss": 2.671, + "step": 7449000 + }, + { + "epoch": 2.3157855469739665, + "grad_norm": 9.941723823547363, + "learning_rate": 1.1403574217100556e-05, + "loss": 2.6436, + "step": 7449500 + }, + { + "epoch": 2.3159409792544534, + "grad_norm": 9.636931419372559, + "learning_rate": 1.1400983679092441e-05, + "loss": 2.6408, + "step": 7450000 + }, + { + "epoch": 2.3160964115349403, + "grad_norm": 8.999906539916992, + "learning_rate": 1.1398393141084328e-05, + "loss": 2.6035, + "step": 7450500 + }, + { + "epoch": 2.316251843815427, + "grad_norm": 11.529014587402344, + "learning_rate": 1.1395802603076212e-05, + "loss": 2.6774, + "step": 7451000 + }, + { + "epoch": 2.316407276095914, + "grad_norm": 11.337753295898438, + "learning_rate": 1.1393212065068098e-05, + "loss": 2.6427, + "step": 7451500 + }, + { + "epoch": 2.316562708376401, + "grad_norm": 15.342877388000488, + "learning_rate": 1.1390621527059983e-05, + "loss": 2.7412, + "step": 7452000 + }, + { + "epoch": 2.3167181406568877, + "grad_norm": 9.745675086975098, + "learning_rate": 1.1388030989051869e-05, + "loss": 2.6727, + "step": 7452500 + }, + { + "epoch": 2.3168735729373746, + "grad_norm": 8.006047248840332, + "learning_rate": 1.1385440451043754e-05, + "loss": 2.6622, + "step": 7453000 + }, + { + "epoch": 2.3170290052178615, + "grad_norm": 8.34312915802002, + "learning_rate": 1.138284991303564e-05, + "loss": 2.6193, + "step": 7453500 + }, + { + "epoch": 2.3171844374983483, + "grad_norm": 10.568881034851074, + "learning_rate": 1.1380259375027525e-05, + "loss": 2.6536, + "step": 7454000 + }, + { + "epoch": 2.317339869778835, + "grad_norm": 8.564251899719238, + "learning_rate": 1.137766883701941e-05, + "loss": 2.6253, + "step": 7454500 + }, + { + "epoch": 2.3174953020593225, + "grad_norm": 8.254532814025879, + "learning_rate": 1.1375078299011296e-05, + "loss": 2.6284, + "step": 7455000 + }, + { + "epoch": 2.317650734339809, + "grad_norm": 10.167815208435059, + "learning_rate": 1.1372487761003181e-05, + "loss": 2.6261, + "step": 7455500 + }, + { + "epoch": 2.3178061666202963, + "grad_norm": 39.467926025390625, + "learning_rate": 1.1369897222995067e-05, + "loss": 2.6451, + "step": 7456000 + }, + { + "epoch": 2.3179615989007827, + "grad_norm": 13.723750114440918, + "learning_rate": 1.1367306684986952e-05, + "loss": 2.6187, + "step": 7456500 + }, + { + "epoch": 2.31811703118127, + "grad_norm": 9.122394561767578, + "learning_rate": 1.1364716146978838e-05, + "loss": 2.6283, + "step": 7457000 + }, + { + "epoch": 2.3182724634617564, + "grad_norm": 9.950982093811035, + "learning_rate": 1.1362125608970722e-05, + "loss": 2.659, + "step": 7457500 + }, + { + "epoch": 2.3184278957422437, + "grad_norm": 7.945058822631836, + "learning_rate": 1.1359535070962609e-05, + "loss": 2.6044, + "step": 7458000 + }, + { + "epoch": 2.3185833280227306, + "grad_norm": 10.22750473022461, + "learning_rate": 1.1356944532954492e-05, + "loss": 2.6766, + "step": 7458500 + }, + { + "epoch": 2.3187387603032175, + "grad_norm": 10.260638236999512, + "learning_rate": 1.135435399494638e-05, + "loss": 2.6096, + "step": 7459000 + }, + { + "epoch": 2.3188941925837043, + "grad_norm": 10.433106422424316, + "learning_rate": 1.1351763456938265e-05, + "loss": 2.6393, + "step": 7459500 + }, + { + "epoch": 2.319049624864191, + "grad_norm": 9.418624877929688, + "learning_rate": 1.1349172918930149e-05, + "loss": 2.62, + "step": 7460000 + }, + { + "epoch": 2.319205057144678, + "grad_norm": 13.413582801818848, + "learning_rate": 1.1346582380922036e-05, + "loss": 2.6729, + "step": 7460500 + }, + { + "epoch": 2.319360489425165, + "grad_norm": 22.712154388427734, + "learning_rate": 1.134399184291392e-05, + "loss": 2.6397, + "step": 7461000 + }, + { + "epoch": 2.319515921705652, + "grad_norm": 9.62330436706543, + "learning_rate": 1.1341401304905807e-05, + "loss": 2.6395, + "step": 7461500 + }, + { + "epoch": 2.3196713539861387, + "grad_norm": 8.513716697692871, + "learning_rate": 1.133881076689769e-05, + "loss": 2.6292, + "step": 7462000 + }, + { + "epoch": 2.3198267862666255, + "grad_norm": 17.078655242919922, + "learning_rate": 1.1336220228889576e-05, + "loss": 2.6092, + "step": 7462500 + }, + { + "epoch": 2.3199822185471124, + "grad_norm": 14.871017456054688, + "learning_rate": 1.1333629690881462e-05, + "loss": 2.6035, + "step": 7463000 + }, + { + "epoch": 2.3201376508275993, + "grad_norm": 13.09859848022461, + "learning_rate": 1.1331039152873347e-05, + "loss": 2.6319, + "step": 7463500 + }, + { + "epoch": 2.320293083108086, + "grad_norm": 14.372514724731445, + "learning_rate": 1.1328448614865234e-05, + "loss": 2.5916, + "step": 7464000 + }, + { + "epoch": 2.320448515388573, + "grad_norm": 10.366181373596191, + "learning_rate": 1.1325858076857118e-05, + "loss": 2.5984, + "step": 7464500 + }, + { + "epoch": 2.32060394766906, + "grad_norm": 8.471647262573242, + "learning_rate": 1.1323267538849003e-05, + "loss": 2.6419, + "step": 7465000 + }, + { + "epoch": 2.3207593799495467, + "grad_norm": 12.256261825561523, + "learning_rate": 1.1320677000840889e-05, + "loss": 2.6433, + "step": 7465500 + }, + { + "epoch": 2.3209148122300336, + "grad_norm": 10.702787399291992, + "learning_rate": 1.1318086462832774e-05, + "loss": 2.632, + "step": 7466000 + }, + { + "epoch": 2.3210702445105205, + "grad_norm": 9.362414360046387, + "learning_rate": 1.131549592482466e-05, + "loss": 2.6513, + "step": 7466500 + }, + { + "epoch": 2.3212256767910073, + "grad_norm": 11.165071487426758, + "learning_rate": 1.1312905386816545e-05, + "loss": 2.6191, + "step": 7467000 + }, + { + "epoch": 2.321381109071494, + "grad_norm": 9.98103141784668, + "learning_rate": 1.131031484880843e-05, + "loss": 2.6407, + "step": 7467500 + }, + { + "epoch": 2.321536541351981, + "grad_norm": 11.697531700134277, + "learning_rate": 1.1307724310800316e-05, + "loss": 2.6721, + "step": 7468000 + }, + { + "epoch": 2.321691973632468, + "grad_norm": 10.670780181884766, + "learning_rate": 1.1305133772792202e-05, + "loss": 2.6595, + "step": 7468500 + }, + { + "epoch": 2.321847405912955, + "grad_norm": 10.27443790435791, + "learning_rate": 1.1302543234784087e-05, + "loss": 2.6647, + "step": 7469000 + }, + { + "epoch": 2.3220028381934417, + "grad_norm": 15.711913108825684, + "learning_rate": 1.1299952696775973e-05, + "loss": 2.6368, + "step": 7469500 + }, + { + "epoch": 2.3221582704739285, + "grad_norm": 9.274199485778809, + "learning_rate": 1.1297362158767858e-05, + "loss": 2.6363, + "step": 7470000 + }, + { + "epoch": 2.3223137027544154, + "grad_norm": 8.280325889587402, + "learning_rate": 1.1294771620759743e-05, + "loss": 2.6081, + "step": 7470500 + }, + { + "epoch": 2.3224691350349023, + "grad_norm": 9.568648338317871, + "learning_rate": 1.1292181082751627e-05, + "loss": 2.637, + "step": 7471000 + }, + { + "epoch": 2.322624567315389, + "grad_norm": 10.182262420654297, + "learning_rate": 1.1289590544743514e-05, + "loss": 2.648, + "step": 7471500 + }, + { + "epoch": 2.322779999595876, + "grad_norm": 16.091699600219727, + "learning_rate": 1.1287000006735398e-05, + "loss": 2.6561, + "step": 7472000 + }, + { + "epoch": 2.322935431876363, + "grad_norm": 9.995500564575195, + "learning_rate": 1.1284409468727285e-05, + "loss": 2.6674, + "step": 7472500 + }, + { + "epoch": 2.3230908641568497, + "grad_norm": 14.177772521972656, + "learning_rate": 1.128181893071917e-05, + "loss": 2.6424, + "step": 7473000 + }, + { + "epoch": 2.3232462964373366, + "grad_norm": 17.47187042236328, + "learning_rate": 1.1279228392711055e-05, + "loss": 2.6576, + "step": 7473500 + }, + { + "epoch": 2.3234017287178235, + "grad_norm": 10.802764892578125, + "learning_rate": 1.1276637854702942e-05, + "loss": 2.6769, + "step": 7474000 + }, + { + "epoch": 2.3235571609983103, + "grad_norm": 8.131357192993164, + "learning_rate": 1.1274047316694825e-05, + "loss": 2.6255, + "step": 7474500 + }, + { + "epoch": 2.323712593278797, + "grad_norm": 29.259349822998047, + "learning_rate": 1.1271456778686713e-05, + "loss": 2.6707, + "step": 7475000 + }, + { + "epoch": 2.323868025559284, + "grad_norm": 10.96568489074707, + "learning_rate": 1.1268866240678596e-05, + "loss": 2.6521, + "step": 7475500 + }, + { + "epoch": 2.324023457839771, + "grad_norm": 19.651458740234375, + "learning_rate": 1.1266275702670482e-05, + "loss": 2.6721, + "step": 7476000 + }, + { + "epoch": 2.324178890120258, + "grad_norm": 7.950291633605957, + "learning_rate": 1.1263685164662367e-05, + "loss": 2.6485, + "step": 7476500 + }, + { + "epoch": 2.3243343224007447, + "grad_norm": 11.697196006774902, + "learning_rate": 1.1261094626654253e-05, + "loss": 2.6772, + "step": 7477000 + }, + { + "epoch": 2.3244897546812315, + "grad_norm": 9.545422554016113, + "learning_rate": 1.125850408864614e-05, + "loss": 2.6359, + "step": 7477500 + }, + { + "epoch": 2.3246451869617184, + "grad_norm": 13.632344245910645, + "learning_rate": 1.1255913550638024e-05, + "loss": 2.5834, + "step": 7478000 + }, + { + "epoch": 2.3248006192422053, + "grad_norm": 9.998568534851074, + "learning_rate": 1.1253323012629909e-05, + "loss": 2.6303, + "step": 7478500 + }, + { + "epoch": 2.324956051522692, + "grad_norm": 9.618268013000488, + "learning_rate": 1.1250732474621795e-05, + "loss": 2.6753, + "step": 7479000 + }, + { + "epoch": 2.3251114838031794, + "grad_norm": 13.653499603271484, + "learning_rate": 1.124814193661368e-05, + "loss": 2.6155, + "step": 7479500 + }, + { + "epoch": 2.325266916083666, + "grad_norm": 9.691633224487305, + "learning_rate": 1.1245551398605565e-05, + "loss": 2.6667, + "step": 7480000 + }, + { + "epoch": 2.325422348364153, + "grad_norm": 9.5654296875, + "learning_rate": 1.1242960860597451e-05, + "loss": 2.6391, + "step": 7480500 + }, + { + "epoch": 2.3255777806446396, + "grad_norm": 9.81852912902832, + "learning_rate": 1.1240370322589336e-05, + "loss": 2.6305, + "step": 7481000 + }, + { + "epoch": 2.325733212925127, + "grad_norm": 39.90187072753906, + "learning_rate": 1.1237779784581222e-05, + "loss": 2.6437, + "step": 7481500 + }, + { + "epoch": 2.325888645205614, + "grad_norm": 10.955416679382324, + "learning_rate": 1.1235189246573107e-05, + "loss": 2.6852, + "step": 7482000 + }, + { + "epoch": 2.3260440774861006, + "grad_norm": 27.825321197509766, + "learning_rate": 1.1232598708564993e-05, + "loss": 2.6241, + "step": 7482500 + }, + { + "epoch": 2.3261995097665875, + "grad_norm": 34.79759979248047, + "learning_rate": 1.1230008170556878e-05, + "loss": 2.6311, + "step": 7483000 + }, + { + "epoch": 2.3263549420470744, + "grad_norm": 6.435932636260986, + "learning_rate": 1.1227417632548764e-05, + "loss": 2.69, + "step": 7483500 + }, + { + "epoch": 2.3265103743275612, + "grad_norm": 9.575766563415527, + "learning_rate": 1.122482709454065e-05, + "loss": 2.6506, + "step": 7484000 + }, + { + "epoch": 2.326665806608048, + "grad_norm": 7.406370162963867, + "learning_rate": 1.1222236556532535e-05, + "loss": 2.6445, + "step": 7484500 + }, + { + "epoch": 2.326821238888535, + "grad_norm": 17.27906608581543, + "learning_rate": 1.121964601852442e-05, + "loss": 2.6357, + "step": 7485000 + }, + { + "epoch": 2.326976671169022, + "grad_norm": 9.584341049194336, + "learning_rate": 1.1217055480516304e-05, + "loss": 2.6179, + "step": 7485500 + }, + { + "epoch": 2.3271321034495087, + "grad_norm": 8.690574645996094, + "learning_rate": 1.1214464942508191e-05, + "loss": 2.6396, + "step": 7486000 + }, + { + "epoch": 2.3272875357299956, + "grad_norm": 20.028766632080078, + "learning_rate": 1.1211874404500076e-05, + "loss": 2.6363, + "step": 7486500 + }, + { + "epoch": 2.3274429680104824, + "grad_norm": 9.905426979064941, + "learning_rate": 1.1209283866491962e-05, + "loss": 2.613, + "step": 7487000 + }, + { + "epoch": 2.3275984002909693, + "grad_norm": 30.784168243408203, + "learning_rate": 1.1206693328483847e-05, + "loss": 2.6321, + "step": 7487500 + }, + { + "epoch": 2.327753832571456, + "grad_norm": 9.022817611694336, + "learning_rate": 1.1204102790475731e-05, + "loss": 2.6595, + "step": 7488000 + }, + { + "epoch": 2.327909264851943, + "grad_norm": 38.93898010253906, + "learning_rate": 1.1201512252467618e-05, + "loss": 2.6385, + "step": 7488500 + }, + { + "epoch": 2.32806469713243, + "grad_norm": 9.582201957702637, + "learning_rate": 1.1198921714459502e-05, + "loss": 2.6699, + "step": 7489000 + }, + { + "epoch": 2.328220129412917, + "grad_norm": 8.981911659240723, + "learning_rate": 1.119633117645139e-05, + "loss": 2.639, + "step": 7489500 + }, + { + "epoch": 2.3283755616934037, + "grad_norm": 9.868436813354492, + "learning_rate": 1.1193740638443273e-05, + "loss": 2.615, + "step": 7490000 + }, + { + "epoch": 2.3285309939738905, + "grad_norm": 9.569181442260742, + "learning_rate": 1.1191150100435158e-05, + "loss": 2.6067, + "step": 7490500 + }, + { + "epoch": 2.3286864262543774, + "grad_norm": 8.368475914001465, + "learning_rate": 1.1188559562427046e-05, + "loss": 2.6642, + "step": 7491000 + }, + { + "epoch": 2.3288418585348643, + "grad_norm": 7.743016242980957, + "learning_rate": 1.118596902441893e-05, + "loss": 2.6585, + "step": 7491500 + }, + { + "epoch": 2.328997290815351, + "grad_norm": 8.57955551147461, + "learning_rate": 1.1183378486410817e-05, + "loss": 2.6444, + "step": 7492000 + }, + { + "epoch": 2.329152723095838, + "grad_norm": 9.65494441986084, + "learning_rate": 1.11807879484027e-05, + "loss": 2.6152, + "step": 7492500 + }, + { + "epoch": 2.329308155376325, + "grad_norm": 9.897436141967773, + "learning_rate": 1.1178197410394586e-05, + "loss": 2.6406, + "step": 7493000 + }, + { + "epoch": 2.3294635876568117, + "grad_norm": 16.05580711364746, + "learning_rate": 1.1175606872386471e-05, + "loss": 2.6591, + "step": 7493500 + }, + { + "epoch": 2.3296190199372986, + "grad_norm": 9.138599395751953, + "learning_rate": 1.1173016334378357e-05, + "loss": 2.6433, + "step": 7494000 + }, + { + "epoch": 2.3297744522177855, + "grad_norm": 9.56319522857666, + "learning_rate": 1.1170425796370242e-05, + "loss": 2.6744, + "step": 7494500 + }, + { + "epoch": 2.3299298844982723, + "grad_norm": 8.217362403869629, + "learning_rate": 1.1167835258362128e-05, + "loss": 2.6426, + "step": 7495000 + }, + { + "epoch": 2.330085316778759, + "grad_norm": 18.832651138305664, + "learning_rate": 1.1165244720354013e-05, + "loss": 2.6379, + "step": 7495500 + }, + { + "epoch": 2.330240749059246, + "grad_norm": 9.00090217590332, + "learning_rate": 1.1162654182345899e-05, + "loss": 2.6994, + "step": 7496000 + }, + { + "epoch": 2.330396181339733, + "grad_norm": 8.840618133544922, + "learning_rate": 1.1160063644337784e-05, + "loss": 2.6023, + "step": 7496500 + }, + { + "epoch": 2.33055161362022, + "grad_norm": 13.288460731506348, + "learning_rate": 1.115747310632967e-05, + "loss": 2.6164, + "step": 7497000 + }, + { + "epoch": 2.3307070459007067, + "grad_norm": 9.984969139099121, + "learning_rate": 1.1154882568321555e-05, + "loss": 2.667, + "step": 7497500 + }, + { + "epoch": 2.3308624781811935, + "grad_norm": 10.426153182983398, + "learning_rate": 1.115229203031344e-05, + "loss": 2.663, + "step": 7498000 + }, + { + "epoch": 2.3310179104616804, + "grad_norm": 16.658737182617188, + "learning_rate": 1.1149701492305326e-05, + "loss": 2.6453, + "step": 7498500 + }, + { + "epoch": 2.3311733427421673, + "grad_norm": 9.173316955566406, + "learning_rate": 1.114711095429721e-05, + "loss": 2.6345, + "step": 7499000 + }, + { + "epoch": 2.331328775022654, + "grad_norm": 26.50439453125, + "learning_rate": 1.1144520416289097e-05, + "loss": 2.6496, + "step": 7499500 + }, + { + "epoch": 2.331484207303141, + "grad_norm": 9.58687973022461, + "learning_rate": 1.1141929878280982e-05, + "loss": 2.6247, + "step": 7500000 + }, + { + "epoch": 2.331639639583628, + "grad_norm": 10.383517265319824, + "learning_rate": 1.1139339340272868e-05, + "loss": 2.6147, + "step": 7500500 + }, + { + "epoch": 2.3317950718641147, + "grad_norm": 10.240723609924316, + "learning_rate": 1.1136748802264753e-05, + "loss": 2.6479, + "step": 7501000 + }, + { + "epoch": 2.3319505041446016, + "grad_norm": 8.975716590881348, + "learning_rate": 1.1134158264256637e-05, + "loss": 2.6221, + "step": 7501500 + }, + { + "epoch": 2.3321059364250885, + "grad_norm": 9.453174591064453, + "learning_rate": 1.1131567726248524e-05, + "loss": 2.6621, + "step": 7502000 + }, + { + "epoch": 2.3322613687055753, + "grad_norm": 6.70010232925415, + "learning_rate": 1.1128977188240408e-05, + "loss": 2.5836, + "step": 7502500 + }, + { + "epoch": 2.3324168009860626, + "grad_norm": 10.870018005371094, + "learning_rate": 1.1126386650232295e-05, + "loss": 2.6124, + "step": 7503000 + }, + { + "epoch": 2.332572233266549, + "grad_norm": 11.532368659973145, + "learning_rate": 1.112379611222418e-05, + "loss": 2.6306, + "step": 7503500 + }, + { + "epoch": 2.3327276655470364, + "grad_norm": 13.216182708740234, + "learning_rate": 1.1121205574216064e-05, + "loss": 2.65, + "step": 7504000 + }, + { + "epoch": 2.332883097827523, + "grad_norm": 9.061773300170898, + "learning_rate": 1.1118615036207951e-05, + "loss": 2.5694, + "step": 7504500 + }, + { + "epoch": 2.33303853010801, + "grad_norm": 9.102398872375488, + "learning_rate": 1.1116024498199835e-05, + "loss": 2.6361, + "step": 7505000 + }, + { + "epoch": 2.3331939623884965, + "grad_norm": 9.578680992126465, + "learning_rate": 1.1113433960191722e-05, + "loss": 2.6582, + "step": 7505500 + }, + { + "epoch": 2.333349394668984, + "grad_norm": 19.067113876342773, + "learning_rate": 1.1110843422183606e-05, + "loss": 2.5703, + "step": 7506000 + }, + { + "epoch": 2.3335048269494707, + "grad_norm": 7.516259670257568, + "learning_rate": 1.1108252884175491e-05, + "loss": 2.7061, + "step": 7506500 + }, + { + "epoch": 2.3336602592299576, + "grad_norm": 7.3904595375061035, + "learning_rate": 1.1105662346167377e-05, + "loss": 2.6268, + "step": 7507000 + }, + { + "epoch": 2.3338156915104444, + "grad_norm": 12.610342979431152, + "learning_rate": 1.1103071808159262e-05, + "loss": 2.6546, + "step": 7507500 + }, + { + "epoch": 2.3339711237909313, + "grad_norm": 86.89161682128906, + "learning_rate": 1.110048127015115e-05, + "loss": 2.6698, + "step": 7508000 + }, + { + "epoch": 2.334126556071418, + "grad_norm": 8.835942268371582, + "learning_rate": 1.1097890732143033e-05, + "loss": 2.6545, + "step": 7508500 + }, + { + "epoch": 2.334281988351905, + "grad_norm": 11.87960147857666, + "learning_rate": 1.1095300194134919e-05, + "loss": 2.5913, + "step": 7509000 + }, + { + "epoch": 2.334437420632392, + "grad_norm": 10.042864799499512, + "learning_rate": 1.1092709656126804e-05, + "loss": 2.6588, + "step": 7509500 + }, + { + "epoch": 2.3345928529128788, + "grad_norm": 8.888025283813477, + "learning_rate": 1.109011911811869e-05, + "loss": 2.6355, + "step": 7510000 + }, + { + "epoch": 2.3347482851933656, + "grad_norm": 15.20809268951416, + "learning_rate": 1.1087528580110575e-05, + "loss": 2.621, + "step": 7510500 + }, + { + "epoch": 2.3349037174738525, + "grad_norm": 10.065488815307617, + "learning_rate": 1.108493804210246e-05, + "loss": 2.6271, + "step": 7511000 + }, + { + "epoch": 2.3350591497543394, + "grad_norm": 23.01068115234375, + "learning_rate": 1.1082347504094346e-05, + "loss": 2.6419, + "step": 7511500 + }, + { + "epoch": 2.3352145820348262, + "grad_norm": 9.407057762145996, + "learning_rate": 1.1079756966086232e-05, + "loss": 2.6411, + "step": 7512000 + }, + { + "epoch": 2.335370014315313, + "grad_norm": 12.484465599060059, + "learning_rate": 1.1077166428078117e-05, + "loss": 2.6063, + "step": 7512500 + }, + { + "epoch": 2.3355254465958, + "grad_norm": 10.29742431640625, + "learning_rate": 1.1074575890070002e-05, + "loss": 2.6495, + "step": 7513000 + }, + { + "epoch": 2.335680878876287, + "grad_norm": 10.897299766540527, + "learning_rate": 1.1071985352061888e-05, + "loss": 2.6161, + "step": 7513500 + }, + { + "epoch": 2.3358363111567737, + "grad_norm": 8.524006843566895, + "learning_rate": 1.1069394814053773e-05, + "loss": 2.5825, + "step": 7514000 + }, + { + "epoch": 2.3359917434372606, + "grad_norm": 11.831510543823242, + "learning_rate": 1.1066804276045659e-05, + "loss": 2.6206, + "step": 7514500 + }, + { + "epoch": 2.3361471757177474, + "grad_norm": 9.202934265136719, + "learning_rate": 1.1064213738037543e-05, + "loss": 2.6165, + "step": 7515000 + }, + { + "epoch": 2.3363026079982343, + "grad_norm": 12.43116283416748, + "learning_rate": 1.106162320002943e-05, + "loss": 2.6585, + "step": 7515500 + }, + { + "epoch": 2.336458040278721, + "grad_norm": 10.753710746765137, + "learning_rate": 1.1059032662021313e-05, + "loss": 2.6645, + "step": 7516000 + }, + { + "epoch": 2.336613472559208, + "grad_norm": 9.959314346313477, + "learning_rate": 1.10564421240132e-05, + "loss": 2.6884, + "step": 7516500 + }, + { + "epoch": 2.336768904839695, + "grad_norm": 9.229012489318848, + "learning_rate": 1.1053851586005086e-05, + "loss": 2.6499, + "step": 7517000 + }, + { + "epoch": 2.3369243371201818, + "grad_norm": 10.20935344696045, + "learning_rate": 1.105126104799697e-05, + "loss": 2.6202, + "step": 7517500 + }, + { + "epoch": 2.3370797694006686, + "grad_norm": 8.900355339050293, + "learning_rate": 1.1048670509988857e-05, + "loss": 2.5756, + "step": 7518000 + }, + { + "epoch": 2.3372352016811555, + "grad_norm": 15.960352897644043, + "learning_rate": 1.104607997198074e-05, + "loss": 2.6543, + "step": 7518500 + }, + { + "epoch": 2.3373906339616424, + "grad_norm": 9.369285583496094, + "learning_rate": 1.1043489433972628e-05, + "loss": 2.6196, + "step": 7519000 + }, + { + "epoch": 2.3375460662421292, + "grad_norm": 12.459593772888184, + "learning_rate": 1.1040898895964512e-05, + "loss": 2.6054, + "step": 7519500 + }, + { + "epoch": 2.337701498522616, + "grad_norm": 11.281132698059082, + "learning_rate": 1.1038308357956397e-05, + "loss": 2.6564, + "step": 7520000 + }, + { + "epoch": 2.337856930803103, + "grad_norm": 8.192427635192871, + "learning_rate": 1.1035717819948283e-05, + "loss": 2.6064, + "step": 7520500 + }, + { + "epoch": 2.33801236308359, + "grad_norm": 12.933549880981445, + "learning_rate": 1.1033127281940168e-05, + "loss": 2.6856, + "step": 7521000 + }, + { + "epoch": 2.3381677953640767, + "grad_norm": 11.981822967529297, + "learning_rate": 1.1030536743932055e-05, + "loss": 2.6244, + "step": 7521500 + }, + { + "epoch": 2.3383232276445636, + "grad_norm": 10.169722557067871, + "learning_rate": 1.1027946205923939e-05, + "loss": 2.6138, + "step": 7522000 + }, + { + "epoch": 2.3384786599250504, + "grad_norm": 8.492910385131836, + "learning_rate": 1.1025355667915824e-05, + "loss": 2.5861, + "step": 7522500 + }, + { + "epoch": 2.3386340922055373, + "grad_norm": 6.415866851806641, + "learning_rate": 1.102276512990771e-05, + "loss": 2.7021, + "step": 7523000 + }, + { + "epoch": 2.338789524486024, + "grad_norm": 8.91253662109375, + "learning_rate": 1.1020174591899595e-05, + "loss": 2.6426, + "step": 7523500 + }, + { + "epoch": 2.338944956766511, + "grad_norm": 9.860477447509766, + "learning_rate": 1.101758405389148e-05, + "loss": 2.6175, + "step": 7524000 + }, + { + "epoch": 2.339100389046998, + "grad_norm": 9.065752029418945, + "learning_rate": 1.1014993515883366e-05, + "loss": 2.6458, + "step": 7524500 + }, + { + "epoch": 2.3392558213274848, + "grad_norm": 9.275657653808594, + "learning_rate": 1.1012402977875252e-05, + "loss": 2.5849, + "step": 7525000 + }, + { + "epoch": 2.3394112536079716, + "grad_norm": 11.629531860351562, + "learning_rate": 1.1009812439867137e-05, + "loss": 2.6085, + "step": 7525500 + }, + { + "epoch": 2.3395666858884585, + "grad_norm": 7.788503646850586, + "learning_rate": 1.1007221901859023e-05, + "loss": 2.622, + "step": 7526000 + }, + { + "epoch": 2.3397221181689454, + "grad_norm": 10.37186050415039, + "learning_rate": 1.1004631363850908e-05, + "loss": 2.6324, + "step": 7526500 + }, + { + "epoch": 2.3398775504494322, + "grad_norm": 9.600406646728516, + "learning_rate": 1.1002040825842794e-05, + "loss": 2.6273, + "step": 7527000 + }, + { + "epoch": 2.3400329827299196, + "grad_norm": 7.972754955291748, + "learning_rate": 1.0999450287834679e-05, + "loss": 2.6766, + "step": 7527500 + }, + { + "epoch": 2.340188415010406, + "grad_norm": 10.887953758239746, + "learning_rate": 1.0996859749826565e-05, + "loss": 2.6335, + "step": 7528000 + }, + { + "epoch": 2.3403438472908933, + "grad_norm": 10.915627479553223, + "learning_rate": 1.0994269211818448e-05, + "loss": 2.6321, + "step": 7528500 + }, + { + "epoch": 2.3404992795713797, + "grad_norm": 9.25206184387207, + "learning_rate": 1.0991678673810335e-05, + "loss": 2.6372, + "step": 7529000 + }, + { + "epoch": 2.340654711851867, + "grad_norm": 9.062067985534668, + "learning_rate": 1.098908813580222e-05, + "loss": 2.6144, + "step": 7529500 + }, + { + "epoch": 2.340810144132354, + "grad_norm": 10.968213081359863, + "learning_rate": 1.0986497597794106e-05, + "loss": 2.6374, + "step": 7530000 + }, + { + "epoch": 2.3409655764128408, + "grad_norm": 5.241974353790283, + "learning_rate": 1.0983907059785992e-05, + "loss": 2.648, + "step": 7530500 + }, + { + "epoch": 2.3411210086933276, + "grad_norm": 13.317415237426758, + "learning_rate": 1.0981316521777876e-05, + "loss": 2.6385, + "step": 7531000 + }, + { + "epoch": 2.3412764409738145, + "grad_norm": 10.072001457214355, + "learning_rate": 1.0978725983769763e-05, + "loss": 2.6228, + "step": 7531500 + }, + { + "epoch": 2.3414318732543014, + "grad_norm": 8.378700256347656, + "learning_rate": 1.0976135445761646e-05, + "loss": 2.6104, + "step": 7532000 + }, + { + "epoch": 2.341587305534788, + "grad_norm": 10.811497688293457, + "learning_rate": 1.0973544907753534e-05, + "loss": 2.6387, + "step": 7532500 + }, + { + "epoch": 2.341742737815275, + "grad_norm": 14.740769386291504, + "learning_rate": 1.0970954369745417e-05, + "loss": 2.5943, + "step": 7533000 + }, + { + "epoch": 2.341898170095762, + "grad_norm": 9.876063346862793, + "learning_rate": 1.0968363831737303e-05, + "loss": 2.6651, + "step": 7533500 + }, + { + "epoch": 2.342053602376249, + "grad_norm": 8.239027976989746, + "learning_rate": 1.0965773293729188e-05, + "loss": 2.6517, + "step": 7534000 + }, + { + "epoch": 2.3422090346567357, + "grad_norm": 8.42080307006836, + "learning_rate": 1.0963182755721074e-05, + "loss": 2.6561, + "step": 7534500 + }, + { + "epoch": 2.3423644669372226, + "grad_norm": 8.335474014282227, + "learning_rate": 1.0960592217712961e-05, + "loss": 2.6252, + "step": 7535000 + }, + { + "epoch": 2.3425198992177094, + "grad_norm": 21.11940574645996, + "learning_rate": 1.0958001679704845e-05, + "loss": 2.612, + "step": 7535500 + }, + { + "epoch": 2.3426753314981963, + "grad_norm": 9.074005126953125, + "learning_rate": 1.095541114169673e-05, + "loss": 2.5926, + "step": 7536000 + }, + { + "epoch": 2.342830763778683, + "grad_norm": 24.463924407958984, + "learning_rate": 1.0952820603688616e-05, + "loss": 2.6172, + "step": 7536500 + }, + { + "epoch": 2.34298619605917, + "grad_norm": 7.87529182434082, + "learning_rate": 1.0950230065680501e-05, + "loss": 2.6466, + "step": 7537000 + }, + { + "epoch": 2.343141628339657, + "grad_norm": 7.967639923095703, + "learning_rate": 1.0947639527672387e-05, + "loss": 2.6439, + "step": 7537500 + }, + { + "epoch": 2.3432970606201438, + "grad_norm": 9.784982681274414, + "learning_rate": 1.0945048989664272e-05, + "loss": 2.6239, + "step": 7538000 + }, + { + "epoch": 2.3434524929006306, + "grad_norm": 17.241695404052734, + "learning_rate": 1.0942458451656157e-05, + "loss": 2.6565, + "step": 7538500 + }, + { + "epoch": 2.3436079251811175, + "grad_norm": 8.77574348449707, + "learning_rate": 1.0939867913648043e-05, + "loss": 2.6243, + "step": 7539000 + }, + { + "epoch": 2.3437633574616044, + "grad_norm": 12.026435852050781, + "learning_rate": 1.0937277375639928e-05, + "loss": 2.5949, + "step": 7539500 + }, + { + "epoch": 2.3439187897420912, + "grad_norm": 8.586336135864258, + "learning_rate": 1.0934686837631814e-05, + "loss": 2.6301, + "step": 7540000 + }, + { + "epoch": 2.344074222022578, + "grad_norm": 9.447407722473145, + "learning_rate": 1.09320962996237e-05, + "loss": 2.63, + "step": 7540500 + }, + { + "epoch": 2.344229654303065, + "grad_norm": 13.070633888244629, + "learning_rate": 1.0929505761615585e-05, + "loss": 2.6003, + "step": 7541000 + }, + { + "epoch": 2.344385086583552, + "grad_norm": 9.979609489440918, + "learning_rate": 1.092691522360747e-05, + "loss": 2.6678, + "step": 7541500 + }, + { + "epoch": 2.3445405188640387, + "grad_norm": 12.89168643951416, + "learning_rate": 1.0924324685599354e-05, + "loss": 2.604, + "step": 7542000 + }, + { + "epoch": 2.3446959511445256, + "grad_norm": 11.150376319885254, + "learning_rate": 1.0921734147591241e-05, + "loss": 2.636, + "step": 7542500 + }, + { + "epoch": 2.3448513834250124, + "grad_norm": 11.525612831115723, + "learning_rate": 1.0919143609583125e-05, + "loss": 2.6451, + "step": 7543000 + }, + { + "epoch": 2.3450068157054993, + "grad_norm": 8.673556327819824, + "learning_rate": 1.0916553071575012e-05, + "loss": 2.6714, + "step": 7543500 + }, + { + "epoch": 2.345162247985986, + "grad_norm": 11.432686805725098, + "learning_rate": 1.0913962533566898e-05, + "loss": 2.6581, + "step": 7544000 + }, + { + "epoch": 2.345317680266473, + "grad_norm": 8.568177223205566, + "learning_rate": 1.0911371995558781e-05, + "loss": 2.6087, + "step": 7544500 + }, + { + "epoch": 2.34547311254696, + "grad_norm": 12.01173210144043, + "learning_rate": 1.0908781457550668e-05, + "loss": 2.653, + "step": 7545000 + }, + { + "epoch": 2.3456285448274468, + "grad_norm": 9.748370170593262, + "learning_rate": 1.0906190919542552e-05, + "loss": 2.6166, + "step": 7545500 + }, + { + "epoch": 2.3457839771079336, + "grad_norm": 15.57289981842041, + "learning_rate": 1.090360038153444e-05, + "loss": 2.647, + "step": 7546000 + }, + { + "epoch": 2.3459394093884205, + "grad_norm": 8.582416534423828, + "learning_rate": 1.0901009843526323e-05, + "loss": 2.6509, + "step": 7546500 + }, + { + "epoch": 2.3460948416689074, + "grad_norm": 26.83332061767578, + "learning_rate": 1.0898419305518209e-05, + "loss": 2.6335, + "step": 7547000 + }, + { + "epoch": 2.3462502739493942, + "grad_norm": 9.609437942504883, + "learning_rate": 1.0895828767510094e-05, + "loss": 2.6193, + "step": 7547500 + }, + { + "epoch": 2.346405706229881, + "grad_norm": 11.189865112304688, + "learning_rate": 1.089323822950198e-05, + "loss": 2.6083, + "step": 7548000 + }, + { + "epoch": 2.346561138510368, + "grad_norm": 10.115628242492676, + "learning_rate": 1.0890647691493867e-05, + "loss": 2.6432, + "step": 7548500 + }, + { + "epoch": 2.346716570790855, + "grad_norm": 8.483443260192871, + "learning_rate": 1.088805715348575e-05, + "loss": 2.584, + "step": 7549000 + }, + { + "epoch": 2.3468720030713417, + "grad_norm": 9.272431373596191, + "learning_rate": 1.0885466615477636e-05, + "loss": 2.6292, + "step": 7549500 + }, + { + "epoch": 2.3470274353518286, + "grad_norm": 14.458526611328125, + "learning_rate": 1.0882876077469521e-05, + "loss": 2.6714, + "step": 7550000 + }, + { + "epoch": 2.3471828676323154, + "grad_norm": 10.307579040527344, + "learning_rate": 1.0880285539461407e-05, + "loss": 2.6385, + "step": 7550500 + }, + { + "epoch": 2.3473382999128027, + "grad_norm": 8.62504768371582, + "learning_rate": 1.0877695001453292e-05, + "loss": 2.6281, + "step": 7551000 + }, + { + "epoch": 2.347493732193289, + "grad_norm": 17.582595825195312, + "learning_rate": 1.0875104463445178e-05, + "loss": 2.6818, + "step": 7551500 + }, + { + "epoch": 2.3476491644737765, + "grad_norm": 23.64348030090332, + "learning_rate": 1.0872513925437063e-05, + "loss": 2.6422, + "step": 7552000 + }, + { + "epoch": 2.347804596754263, + "grad_norm": 10.053281784057617, + "learning_rate": 1.0869923387428949e-05, + "loss": 2.652, + "step": 7552500 + }, + { + "epoch": 2.34796002903475, + "grad_norm": 10.409714698791504, + "learning_rate": 1.0867332849420834e-05, + "loss": 2.6251, + "step": 7553000 + }, + { + "epoch": 2.3481154613152366, + "grad_norm": 12.985381126403809, + "learning_rate": 1.086474231141272e-05, + "loss": 2.6235, + "step": 7553500 + }, + { + "epoch": 2.348270893595724, + "grad_norm": 9.456718444824219, + "learning_rate": 1.0862151773404605e-05, + "loss": 2.6219, + "step": 7554000 + }, + { + "epoch": 2.348426325876211, + "grad_norm": 9.688949584960938, + "learning_rate": 1.085956123539649e-05, + "loss": 2.6574, + "step": 7554500 + }, + { + "epoch": 2.3485817581566977, + "grad_norm": 10.460811614990234, + "learning_rate": 1.0856970697388376e-05, + "loss": 2.6274, + "step": 7555000 + }, + { + "epoch": 2.3487371904371845, + "grad_norm": 8.16241455078125, + "learning_rate": 1.085438015938026e-05, + "loss": 2.6684, + "step": 7555500 + }, + { + "epoch": 2.3488926227176714, + "grad_norm": 15.688484191894531, + "learning_rate": 1.0851789621372147e-05, + "loss": 2.6594, + "step": 7556000 + }, + { + "epoch": 2.3490480549981583, + "grad_norm": 12.359514236450195, + "learning_rate": 1.084919908336403e-05, + "loss": 2.6264, + "step": 7556500 + }, + { + "epoch": 2.349203487278645, + "grad_norm": 9.17490005493164, + "learning_rate": 1.0846608545355918e-05, + "loss": 2.62, + "step": 7557000 + }, + { + "epoch": 2.349358919559132, + "grad_norm": 13.666428565979004, + "learning_rate": 1.0844018007347803e-05, + "loss": 2.6545, + "step": 7557500 + }, + { + "epoch": 2.349514351839619, + "grad_norm": 17.66437339782715, + "learning_rate": 1.0841427469339687e-05, + "loss": 2.6078, + "step": 7558000 + }, + { + "epoch": 2.3496697841201057, + "grad_norm": 9.53357982635498, + "learning_rate": 1.0838836931331574e-05, + "loss": 2.6058, + "step": 7558500 + }, + { + "epoch": 2.3498252164005926, + "grad_norm": 16.715776443481445, + "learning_rate": 1.0836246393323458e-05, + "loss": 2.6281, + "step": 7559000 + }, + { + "epoch": 2.3499806486810795, + "grad_norm": 12.0059175491333, + "learning_rate": 1.0833655855315345e-05, + "loss": 2.665, + "step": 7559500 + }, + { + "epoch": 2.3501360809615663, + "grad_norm": 9.525666236877441, + "learning_rate": 1.0831065317307229e-05, + "loss": 2.5878, + "step": 7560000 + }, + { + "epoch": 2.350291513242053, + "grad_norm": 11.43161678314209, + "learning_rate": 1.0828474779299114e-05, + "loss": 2.6248, + "step": 7560500 + }, + { + "epoch": 2.35044694552254, + "grad_norm": 8.11133861541748, + "learning_rate": 1.0825884241291e-05, + "loss": 2.6171, + "step": 7561000 + }, + { + "epoch": 2.350602377803027, + "grad_norm": 10.110093116760254, + "learning_rate": 1.0823293703282885e-05, + "loss": 2.6181, + "step": 7561500 + }, + { + "epoch": 2.350757810083514, + "grad_norm": 10.402518272399902, + "learning_rate": 1.0820703165274772e-05, + "loss": 2.6593, + "step": 7562000 + }, + { + "epoch": 2.3509132423640007, + "grad_norm": 15.517383575439453, + "learning_rate": 1.0818112627266656e-05, + "loss": 2.6295, + "step": 7562500 + }, + { + "epoch": 2.3510686746444875, + "grad_norm": 12.019909858703613, + "learning_rate": 1.0815522089258542e-05, + "loss": 2.6214, + "step": 7563000 + }, + { + "epoch": 2.3512241069249744, + "grad_norm": 51.15325164794922, + "learning_rate": 1.0812931551250427e-05, + "loss": 2.5981, + "step": 7563500 + }, + { + "epoch": 2.3513795392054613, + "grad_norm": 10.298105239868164, + "learning_rate": 1.0810341013242312e-05, + "loss": 2.6551, + "step": 7564000 + }, + { + "epoch": 2.351534971485948, + "grad_norm": 26.070186614990234, + "learning_rate": 1.0807750475234198e-05, + "loss": 2.6107, + "step": 7564500 + }, + { + "epoch": 2.351690403766435, + "grad_norm": 6.925928115844727, + "learning_rate": 1.0805159937226083e-05, + "loss": 2.614, + "step": 7565000 + }, + { + "epoch": 2.351845836046922, + "grad_norm": 10.615784645080566, + "learning_rate": 1.0802569399217969e-05, + "loss": 2.6554, + "step": 7565500 + }, + { + "epoch": 2.3520012683274087, + "grad_norm": 10.648160934448242, + "learning_rate": 1.0799978861209854e-05, + "loss": 2.6381, + "step": 7566000 + }, + { + "epoch": 2.3521567006078956, + "grad_norm": 10.997838973999023, + "learning_rate": 1.079738832320174e-05, + "loss": 2.6387, + "step": 7566500 + }, + { + "epoch": 2.3523121328883825, + "grad_norm": 12.445625305175781, + "learning_rate": 1.0794797785193625e-05, + "loss": 2.6834, + "step": 7567000 + }, + { + "epoch": 2.3524675651688693, + "grad_norm": 9.398184776306152, + "learning_rate": 1.079220724718551e-05, + "loss": 2.6387, + "step": 7567500 + }, + { + "epoch": 2.352622997449356, + "grad_norm": 11.230720520019531, + "learning_rate": 1.0789616709177396e-05, + "loss": 2.7212, + "step": 7568000 + }, + { + "epoch": 2.352778429729843, + "grad_norm": 15.9075345993042, + "learning_rate": 1.0787026171169282e-05, + "loss": 2.6395, + "step": 7568500 + }, + { + "epoch": 2.35293386201033, + "grad_norm": 9.99553394317627, + "learning_rate": 1.0784435633161165e-05, + "loss": 2.642, + "step": 7569000 + }, + { + "epoch": 2.353089294290817, + "grad_norm": 9.740135192871094, + "learning_rate": 1.0781845095153053e-05, + "loss": 2.6582, + "step": 7569500 + }, + { + "epoch": 2.3532447265713037, + "grad_norm": 10.385177612304688, + "learning_rate": 1.0779254557144936e-05, + "loss": 2.7018, + "step": 7570000 + }, + { + "epoch": 2.3534001588517905, + "grad_norm": 10.98214340209961, + "learning_rate": 1.0776664019136823e-05, + "loss": 2.6373, + "step": 7570500 + }, + { + "epoch": 2.3535555911322774, + "grad_norm": 10.515395164489746, + "learning_rate": 1.0774073481128709e-05, + "loss": 2.6576, + "step": 7571000 + }, + { + "epoch": 2.3537110234127643, + "grad_norm": 8.981656074523926, + "learning_rate": 1.0771482943120593e-05, + "loss": 2.6323, + "step": 7571500 + }, + { + "epoch": 2.353866455693251, + "grad_norm": 8.558149337768555, + "learning_rate": 1.076889240511248e-05, + "loss": 2.5975, + "step": 7572000 + }, + { + "epoch": 2.354021887973738, + "grad_norm": 12.105164527893066, + "learning_rate": 1.0766301867104364e-05, + "loss": 2.6211, + "step": 7572500 + }, + { + "epoch": 2.354177320254225, + "grad_norm": 16.92572784423828, + "learning_rate": 1.076371132909625e-05, + "loss": 2.6071, + "step": 7573000 + }, + { + "epoch": 2.3543327525347117, + "grad_norm": 9.380132675170898, + "learning_rate": 1.0761120791088135e-05, + "loss": 2.6088, + "step": 7573500 + }, + { + "epoch": 2.3544881848151986, + "grad_norm": 10.951276779174805, + "learning_rate": 1.075853025308002e-05, + "loss": 2.5933, + "step": 7574000 + }, + { + "epoch": 2.3546436170956855, + "grad_norm": 7.5036139488220215, + "learning_rate": 1.0755939715071905e-05, + "loss": 2.6479, + "step": 7574500 + }, + { + "epoch": 2.3547990493761723, + "grad_norm": 10.3972806930542, + "learning_rate": 1.0753349177063791e-05, + "loss": 2.5974, + "step": 7575000 + }, + { + "epoch": 2.3549544816566597, + "grad_norm": 14.842426300048828, + "learning_rate": 1.0750758639055678e-05, + "loss": 2.6203, + "step": 7575500 + }, + { + "epoch": 2.355109913937146, + "grad_norm": 10.671001434326172, + "learning_rate": 1.0748168101047562e-05, + "loss": 2.6393, + "step": 7576000 + }, + { + "epoch": 2.3552653462176334, + "grad_norm": 12.619975090026855, + "learning_rate": 1.0745577563039447e-05, + "loss": 2.6635, + "step": 7576500 + }, + { + "epoch": 2.35542077849812, + "grad_norm": 13.455418586730957, + "learning_rate": 1.0742987025031333e-05, + "loss": 2.5941, + "step": 7577000 + }, + { + "epoch": 2.355576210778607, + "grad_norm": 9.934647560119629, + "learning_rate": 1.0740396487023218e-05, + "loss": 2.6967, + "step": 7577500 + }, + { + "epoch": 2.3557316430590935, + "grad_norm": 7.588305950164795, + "learning_rate": 1.0737805949015104e-05, + "loss": 2.6414, + "step": 7578000 + }, + { + "epoch": 2.355887075339581, + "grad_norm": 42.67139434814453, + "learning_rate": 1.0735215411006989e-05, + "loss": 2.6156, + "step": 7578500 + }, + { + "epoch": 2.3560425076200677, + "grad_norm": 24.10552978515625, + "learning_rate": 1.0732624872998875e-05, + "loss": 2.5985, + "step": 7579000 + }, + { + "epoch": 2.3561979399005546, + "grad_norm": 12.482074737548828, + "learning_rate": 1.073003433499076e-05, + "loss": 2.6176, + "step": 7579500 + }, + { + "epoch": 2.3563533721810415, + "grad_norm": 11.889545440673828, + "learning_rate": 1.0727443796982646e-05, + "loss": 2.6639, + "step": 7580000 + }, + { + "epoch": 2.3565088044615283, + "grad_norm": 11.212080001831055, + "learning_rate": 1.0724853258974531e-05, + "loss": 2.6308, + "step": 7580500 + }, + { + "epoch": 2.356664236742015, + "grad_norm": 16.956336975097656, + "learning_rate": 1.0722262720966416e-05, + "loss": 2.6358, + "step": 7581000 + }, + { + "epoch": 2.356819669022502, + "grad_norm": 9.075366973876953, + "learning_rate": 1.0719672182958302e-05, + "loss": 2.6247, + "step": 7581500 + }, + { + "epoch": 2.356975101302989, + "grad_norm": 9.762151718139648, + "learning_rate": 1.0717081644950187e-05, + "loss": 2.6366, + "step": 7582000 + }, + { + "epoch": 2.357130533583476, + "grad_norm": 7.273064613342285, + "learning_rate": 1.0714491106942073e-05, + "loss": 2.632, + "step": 7582500 + }, + { + "epoch": 2.3572859658639627, + "grad_norm": 9.120776176452637, + "learning_rate": 1.0711900568933958e-05, + "loss": 2.658, + "step": 7583000 + }, + { + "epoch": 2.3574413981444495, + "grad_norm": 8.127216339111328, + "learning_rate": 1.0709310030925842e-05, + "loss": 2.6323, + "step": 7583500 + }, + { + "epoch": 2.3575968304249364, + "grad_norm": 13.74614429473877, + "learning_rate": 1.070671949291773e-05, + "loss": 2.6341, + "step": 7584000 + }, + { + "epoch": 2.3577522627054233, + "grad_norm": 11.787015914916992, + "learning_rate": 1.0704128954909615e-05, + "loss": 2.6079, + "step": 7584500 + }, + { + "epoch": 2.35790769498591, + "grad_norm": 10.056839942932129, + "learning_rate": 1.07015384169015e-05, + "loss": 2.6436, + "step": 7585000 + }, + { + "epoch": 2.358063127266397, + "grad_norm": 11.377031326293945, + "learning_rate": 1.0698947878893386e-05, + "loss": 2.6702, + "step": 7585500 + }, + { + "epoch": 2.358218559546884, + "grad_norm": 14.574565887451172, + "learning_rate": 1.069635734088527e-05, + "loss": 2.6285, + "step": 7586000 + }, + { + "epoch": 2.3583739918273707, + "grad_norm": 40.639591217041016, + "learning_rate": 1.0693766802877156e-05, + "loss": 2.6506, + "step": 7586500 + }, + { + "epoch": 2.3585294241078576, + "grad_norm": 32.367393493652344, + "learning_rate": 1.069117626486904e-05, + "loss": 2.635, + "step": 7587000 + }, + { + "epoch": 2.3586848563883445, + "grad_norm": 16.197654724121094, + "learning_rate": 1.0688585726860927e-05, + "loss": 2.6225, + "step": 7587500 + }, + { + "epoch": 2.3588402886688313, + "grad_norm": 13.732818603515625, + "learning_rate": 1.0685995188852811e-05, + "loss": 2.6534, + "step": 7588000 + }, + { + "epoch": 2.358995720949318, + "grad_norm": 20.310131072998047, + "learning_rate": 1.0683404650844697e-05, + "loss": 2.6553, + "step": 7588500 + }, + { + "epoch": 2.359151153229805, + "grad_norm": 6.450648784637451, + "learning_rate": 1.0680814112836584e-05, + "loss": 2.6922, + "step": 7589000 + }, + { + "epoch": 2.359306585510292, + "grad_norm": 9.306100845336914, + "learning_rate": 1.0678223574828468e-05, + "loss": 2.6169, + "step": 7589500 + }, + { + "epoch": 2.359462017790779, + "grad_norm": 9.1519193649292, + "learning_rate": 1.0675633036820355e-05, + "loss": 2.663, + "step": 7590000 + }, + { + "epoch": 2.3596174500712657, + "grad_norm": 10.845722198486328, + "learning_rate": 1.0673042498812238e-05, + "loss": 2.6541, + "step": 7590500 + }, + { + "epoch": 2.3597728823517525, + "grad_norm": 44.77931213378906, + "learning_rate": 1.0670451960804124e-05, + "loss": 2.5989, + "step": 7591000 + }, + { + "epoch": 2.3599283146322394, + "grad_norm": 9.064468383789062, + "learning_rate": 1.066786142279601e-05, + "loss": 2.6174, + "step": 7591500 + }, + { + "epoch": 2.3600837469127263, + "grad_norm": 12.83952808380127, + "learning_rate": 1.0665270884787895e-05, + "loss": 2.6546, + "step": 7592000 + }, + { + "epoch": 2.360239179193213, + "grad_norm": 16.0427188873291, + "learning_rate": 1.066268034677978e-05, + "loss": 2.5999, + "step": 7592500 + }, + { + "epoch": 2.3603946114737, + "grad_norm": 8.894068717956543, + "learning_rate": 1.0660089808771666e-05, + "loss": 2.5765, + "step": 7593000 + }, + { + "epoch": 2.360550043754187, + "grad_norm": 9.635061264038086, + "learning_rate": 1.0657499270763551e-05, + "loss": 2.6486, + "step": 7593500 + }, + { + "epoch": 2.3607054760346737, + "grad_norm": 10.002092361450195, + "learning_rate": 1.0654908732755437e-05, + "loss": 2.6422, + "step": 7594000 + }, + { + "epoch": 2.3608609083151606, + "grad_norm": 14.068400382995605, + "learning_rate": 1.0652318194747322e-05, + "loss": 2.6679, + "step": 7594500 + }, + { + "epoch": 2.3610163405956475, + "grad_norm": 8.706127166748047, + "learning_rate": 1.0649727656739208e-05, + "loss": 2.5883, + "step": 7595000 + }, + { + "epoch": 2.3611717728761343, + "grad_norm": 14.783721923828125, + "learning_rate": 1.0647137118731093e-05, + "loss": 2.6104, + "step": 7595500 + }, + { + "epoch": 2.361327205156621, + "grad_norm": 8.07502555847168, + "learning_rate": 1.0644546580722979e-05, + "loss": 2.667, + "step": 7596000 + }, + { + "epoch": 2.361482637437108, + "grad_norm": 11.370097160339355, + "learning_rate": 1.0641956042714864e-05, + "loss": 2.6592, + "step": 7596500 + }, + { + "epoch": 2.361638069717595, + "grad_norm": 14.09086799621582, + "learning_rate": 1.0639365504706748e-05, + "loss": 2.634, + "step": 7597000 + }, + { + "epoch": 2.361793501998082, + "grad_norm": 24.64467430114746, + "learning_rate": 1.0636774966698635e-05, + "loss": 2.628, + "step": 7597500 + }, + { + "epoch": 2.3619489342785687, + "grad_norm": 8.839207649230957, + "learning_rate": 1.063418442869052e-05, + "loss": 2.6491, + "step": 7598000 + }, + { + "epoch": 2.3621043665590555, + "grad_norm": 9.246192932128906, + "learning_rate": 1.0631593890682406e-05, + "loss": 2.6267, + "step": 7598500 + }, + { + "epoch": 2.3622597988395424, + "grad_norm": 8.55553913116455, + "learning_rate": 1.0629003352674291e-05, + "loss": 2.6566, + "step": 7599000 + }, + { + "epoch": 2.3624152311200293, + "grad_norm": 9.016192436218262, + "learning_rate": 1.0626412814666175e-05, + "loss": 2.6386, + "step": 7599500 + }, + { + "epoch": 2.3625706634005166, + "grad_norm": 32.29362869262695, + "learning_rate": 1.0623822276658062e-05, + "loss": 2.642, + "step": 7600000 + }, + { + "epoch": 2.362726095681003, + "grad_norm": 9.201004981994629, + "learning_rate": 1.0621231738649946e-05, + "loss": 2.6218, + "step": 7600500 + }, + { + "epoch": 2.3628815279614903, + "grad_norm": 9.797408103942871, + "learning_rate": 1.0618641200641833e-05, + "loss": 2.657, + "step": 7601000 + }, + { + "epoch": 2.3630369602419767, + "grad_norm": 17.449567794799805, + "learning_rate": 1.0616050662633717e-05, + "loss": 2.622, + "step": 7601500 + }, + { + "epoch": 2.363192392522464, + "grad_norm": 9.580248832702637, + "learning_rate": 1.0613460124625602e-05, + "loss": 2.642, + "step": 7602000 + }, + { + "epoch": 2.363347824802951, + "grad_norm": 15.109284400939941, + "learning_rate": 1.061086958661749e-05, + "loss": 2.6451, + "step": 7602500 + }, + { + "epoch": 2.363503257083438, + "grad_norm": 9.649141311645508, + "learning_rate": 1.0608279048609373e-05, + "loss": 2.6427, + "step": 7603000 + }, + { + "epoch": 2.3636586893639246, + "grad_norm": 8.526068687438965, + "learning_rate": 1.060568851060126e-05, + "loss": 2.6887, + "step": 7603500 + }, + { + "epoch": 2.3638141216444115, + "grad_norm": 10.62264347076416, + "learning_rate": 1.0603097972593144e-05, + "loss": 2.6182, + "step": 7604000 + }, + { + "epoch": 2.3639695539248984, + "grad_norm": 12.146322250366211, + "learning_rate": 1.060050743458503e-05, + "loss": 2.6428, + "step": 7604500 + }, + { + "epoch": 2.3641249862053852, + "grad_norm": 14.49087142944336, + "learning_rate": 1.0597916896576915e-05, + "loss": 2.6016, + "step": 7605000 + }, + { + "epoch": 2.364280418485872, + "grad_norm": 11.389734268188477, + "learning_rate": 1.05953263585688e-05, + "loss": 2.6373, + "step": 7605500 + }, + { + "epoch": 2.364435850766359, + "grad_norm": 9.65492057800293, + "learning_rate": 1.0592735820560686e-05, + "loss": 2.6541, + "step": 7606000 + }, + { + "epoch": 2.364591283046846, + "grad_norm": 9.311172485351562, + "learning_rate": 1.0590145282552571e-05, + "loss": 2.6413, + "step": 7606500 + }, + { + "epoch": 2.3647467153273327, + "grad_norm": 9.95719051361084, + "learning_rate": 1.0587554744544457e-05, + "loss": 2.5874, + "step": 7607000 + }, + { + "epoch": 2.3649021476078196, + "grad_norm": 9.114496231079102, + "learning_rate": 1.0584964206536342e-05, + "loss": 2.6713, + "step": 7607500 + }, + { + "epoch": 2.3650575798883064, + "grad_norm": 8.763101577758789, + "learning_rate": 1.0582373668528228e-05, + "loss": 2.6592, + "step": 7608000 + }, + { + "epoch": 2.3652130121687933, + "grad_norm": 10.300747871398926, + "learning_rate": 1.0579783130520113e-05, + "loss": 2.6123, + "step": 7608500 + }, + { + "epoch": 2.36536844444928, + "grad_norm": 8.868996620178223, + "learning_rate": 1.0577192592511999e-05, + "loss": 2.6086, + "step": 7609000 + }, + { + "epoch": 2.365523876729767, + "grad_norm": 9.803224563598633, + "learning_rate": 1.0574602054503884e-05, + "loss": 2.6539, + "step": 7609500 + }, + { + "epoch": 2.365679309010254, + "grad_norm": 9.924738883972168, + "learning_rate": 1.057201151649577e-05, + "loss": 2.6211, + "step": 7610000 + }, + { + "epoch": 2.365834741290741, + "grad_norm": 10.605155944824219, + "learning_rate": 1.0569420978487653e-05, + "loss": 2.6343, + "step": 7610500 + }, + { + "epoch": 2.3659901735712277, + "grad_norm": 10.22475814819336, + "learning_rate": 1.056683044047954e-05, + "loss": 2.6071, + "step": 7611000 + }, + { + "epoch": 2.3661456058517145, + "grad_norm": 11.85415267944336, + "learning_rate": 1.0564239902471426e-05, + "loss": 2.6184, + "step": 7611500 + }, + { + "epoch": 2.3663010381322014, + "grad_norm": 11.809859275817871, + "learning_rate": 1.0561649364463312e-05, + "loss": 2.623, + "step": 7612000 + }, + { + "epoch": 2.3664564704126883, + "grad_norm": 11.192787170410156, + "learning_rate": 1.0559058826455197e-05, + "loss": 2.6271, + "step": 7612500 + }, + { + "epoch": 2.366611902693175, + "grad_norm": 10.811203956604004, + "learning_rate": 1.055646828844708e-05, + "loss": 2.6226, + "step": 7613000 + }, + { + "epoch": 2.366767334973662, + "grad_norm": 11.529502868652344, + "learning_rate": 1.0553877750438968e-05, + "loss": 2.6591, + "step": 7613500 + }, + { + "epoch": 2.366922767254149, + "grad_norm": 8.778847694396973, + "learning_rate": 1.0551287212430852e-05, + "loss": 2.5917, + "step": 7614000 + }, + { + "epoch": 2.3670781995346357, + "grad_norm": 10.953516006469727, + "learning_rate": 1.0548696674422739e-05, + "loss": 2.6477, + "step": 7614500 + }, + { + "epoch": 2.3672336318151226, + "grad_norm": 18.924108505249023, + "learning_rate": 1.0546106136414623e-05, + "loss": 2.601, + "step": 7615000 + }, + { + "epoch": 2.3673890640956095, + "grad_norm": 15.541810035705566, + "learning_rate": 1.0543515598406508e-05, + "loss": 2.6221, + "step": 7615500 + }, + { + "epoch": 2.3675444963760963, + "grad_norm": 9.764793395996094, + "learning_rate": 1.0540925060398395e-05, + "loss": 2.6107, + "step": 7616000 + }, + { + "epoch": 2.367699928656583, + "grad_norm": 11.855210304260254, + "learning_rate": 1.0538334522390279e-05, + "loss": 2.6402, + "step": 7616500 + }, + { + "epoch": 2.36785536093707, + "grad_norm": 9.734063148498535, + "learning_rate": 1.0535743984382166e-05, + "loss": 2.6168, + "step": 7617000 + }, + { + "epoch": 2.368010793217557, + "grad_norm": 8.47498893737793, + "learning_rate": 1.053315344637405e-05, + "loss": 2.6386, + "step": 7617500 + }, + { + "epoch": 2.368166225498044, + "grad_norm": 9.856094360351562, + "learning_rate": 1.0530562908365935e-05, + "loss": 2.6654, + "step": 7618000 + }, + { + "epoch": 2.3683216577785307, + "grad_norm": 15.113788604736328, + "learning_rate": 1.052797237035782e-05, + "loss": 2.5685, + "step": 7618500 + }, + { + "epoch": 2.3684770900590175, + "grad_norm": 9.667193412780762, + "learning_rate": 1.0525381832349706e-05, + "loss": 2.6295, + "step": 7619000 + }, + { + "epoch": 2.3686325223395044, + "grad_norm": 10.97118091583252, + "learning_rate": 1.0522791294341592e-05, + "loss": 2.6087, + "step": 7619500 + }, + { + "epoch": 2.3687879546199913, + "grad_norm": 12.258295059204102, + "learning_rate": 1.0520200756333477e-05, + "loss": 2.6023, + "step": 7620000 + }, + { + "epoch": 2.368943386900478, + "grad_norm": 11.123725891113281, + "learning_rate": 1.0517610218325363e-05, + "loss": 2.5773, + "step": 7620500 + }, + { + "epoch": 2.369098819180965, + "grad_norm": 15.862106323242188, + "learning_rate": 1.0515019680317248e-05, + "loss": 2.6417, + "step": 7621000 + }, + { + "epoch": 2.369254251461452, + "grad_norm": 8.027904510498047, + "learning_rate": 1.0512429142309134e-05, + "loss": 2.6532, + "step": 7621500 + }, + { + "epoch": 2.3694096837419387, + "grad_norm": 18.33783721923828, + "learning_rate": 1.0509838604301019e-05, + "loss": 2.5993, + "step": 7622000 + }, + { + "epoch": 2.3695651160224256, + "grad_norm": 10.659879684448242, + "learning_rate": 1.0507248066292904e-05, + "loss": 2.6073, + "step": 7622500 + }, + { + "epoch": 2.3697205483029125, + "grad_norm": 12.252805709838867, + "learning_rate": 1.050465752828479e-05, + "loss": 2.6669, + "step": 7623000 + }, + { + "epoch": 2.3698759805833998, + "grad_norm": 10.77575397491455, + "learning_rate": 1.0502066990276675e-05, + "loss": 2.6305, + "step": 7623500 + }, + { + "epoch": 2.370031412863886, + "grad_norm": 13.874811172485352, + "learning_rate": 1.0499476452268559e-05, + "loss": 2.5955, + "step": 7624000 + }, + { + "epoch": 2.3701868451443735, + "grad_norm": 11.615581512451172, + "learning_rate": 1.0496885914260446e-05, + "loss": 2.6553, + "step": 7624500 + }, + { + "epoch": 2.37034227742486, + "grad_norm": 21.355409622192383, + "learning_rate": 1.0494295376252332e-05, + "loss": 2.6065, + "step": 7625000 + }, + { + "epoch": 2.3704977097053472, + "grad_norm": 9.236777305603027, + "learning_rate": 1.0491704838244217e-05, + "loss": 2.6098, + "step": 7625500 + }, + { + "epoch": 2.3706531419858337, + "grad_norm": 9.85305404663086, + "learning_rate": 1.0489114300236103e-05, + "loss": 2.6168, + "step": 7626000 + }, + { + "epoch": 2.370808574266321, + "grad_norm": 13.91291332244873, + "learning_rate": 1.0486523762227986e-05, + "loss": 2.6224, + "step": 7626500 + }, + { + "epoch": 2.370964006546808, + "grad_norm": 9.247631072998047, + "learning_rate": 1.0483933224219874e-05, + "loss": 2.6409, + "step": 7627000 + }, + { + "epoch": 2.3711194388272947, + "grad_norm": 10.483152389526367, + "learning_rate": 1.0481342686211757e-05, + "loss": 2.6169, + "step": 7627500 + }, + { + "epoch": 2.3712748711077816, + "grad_norm": 11.664372444152832, + "learning_rate": 1.0478752148203645e-05, + "loss": 2.5625, + "step": 7628000 + }, + { + "epoch": 2.3714303033882684, + "grad_norm": 9.599113464355469, + "learning_rate": 1.0476161610195528e-05, + "loss": 2.6196, + "step": 7628500 + }, + { + "epoch": 2.3715857356687553, + "grad_norm": 9.653324127197266, + "learning_rate": 1.0473571072187414e-05, + "loss": 2.618, + "step": 7629000 + }, + { + "epoch": 2.371741167949242, + "grad_norm": 12.698034286499023, + "learning_rate": 1.0470980534179301e-05, + "loss": 2.6393, + "step": 7629500 + }, + { + "epoch": 2.371896600229729, + "grad_norm": 8.93449878692627, + "learning_rate": 1.0468389996171185e-05, + "loss": 2.6127, + "step": 7630000 + }, + { + "epoch": 2.372052032510216, + "grad_norm": 12.683562278747559, + "learning_rate": 1.0465799458163072e-05, + "loss": 2.656, + "step": 7630500 + }, + { + "epoch": 2.3722074647907028, + "grad_norm": 10.62509536743164, + "learning_rate": 1.0463208920154956e-05, + "loss": 2.6354, + "step": 7631000 + }, + { + "epoch": 2.3723628970711896, + "grad_norm": 13.467142105102539, + "learning_rate": 1.0460618382146841e-05, + "loss": 2.6145, + "step": 7631500 + }, + { + "epoch": 2.3725183293516765, + "grad_norm": 7.963897705078125, + "learning_rate": 1.0458027844138726e-05, + "loss": 2.5995, + "step": 7632000 + }, + { + "epoch": 2.3726737616321634, + "grad_norm": 9.103007316589355, + "learning_rate": 1.0455437306130612e-05, + "loss": 2.5903, + "step": 7632500 + }, + { + "epoch": 2.3728291939126502, + "grad_norm": 8.143733024597168, + "learning_rate": 1.0452846768122497e-05, + "loss": 2.6128, + "step": 7633000 + }, + { + "epoch": 2.372984626193137, + "grad_norm": 11.383515357971191, + "learning_rate": 1.0450256230114383e-05, + "loss": 2.6349, + "step": 7633500 + }, + { + "epoch": 2.373140058473624, + "grad_norm": 8.105177879333496, + "learning_rate": 1.0447665692106268e-05, + "loss": 2.697, + "step": 7634000 + }, + { + "epoch": 2.373295490754111, + "grad_norm": 9.933363914489746, + "learning_rate": 1.0445075154098154e-05, + "loss": 2.6097, + "step": 7634500 + }, + { + "epoch": 2.3734509230345977, + "grad_norm": 12.927339553833008, + "learning_rate": 1.044248461609004e-05, + "loss": 2.663, + "step": 7635000 + }, + { + "epoch": 2.3736063553150846, + "grad_norm": 41.19318771362305, + "learning_rate": 1.0439894078081925e-05, + "loss": 2.6295, + "step": 7635500 + }, + { + "epoch": 2.3737617875955714, + "grad_norm": 9.678160667419434, + "learning_rate": 1.043730354007381e-05, + "loss": 2.6173, + "step": 7636000 + }, + { + "epoch": 2.3739172198760583, + "grad_norm": 9.714021682739258, + "learning_rate": 1.0434713002065696e-05, + "loss": 2.6311, + "step": 7636500 + }, + { + "epoch": 2.374072652156545, + "grad_norm": 10.466194152832031, + "learning_rate": 1.0432122464057581e-05, + "loss": 2.6202, + "step": 7637000 + }, + { + "epoch": 2.374228084437032, + "grad_norm": 12.60486125946045, + "learning_rate": 1.0429531926049467e-05, + "loss": 2.6229, + "step": 7637500 + }, + { + "epoch": 2.374383516717519, + "grad_norm": 10.789605140686035, + "learning_rate": 1.0426941388041352e-05, + "loss": 2.6493, + "step": 7638000 + }, + { + "epoch": 2.3745389489980058, + "grad_norm": 10.093914985656738, + "learning_rate": 1.0424350850033237e-05, + "loss": 2.6201, + "step": 7638500 + }, + { + "epoch": 2.3746943812784926, + "grad_norm": 13.960576057434082, + "learning_rate": 1.0421760312025123e-05, + "loss": 2.6647, + "step": 7639000 + }, + { + "epoch": 2.3748498135589795, + "grad_norm": 9.291492462158203, + "learning_rate": 1.0419169774017008e-05, + "loss": 2.6635, + "step": 7639500 + }, + { + "epoch": 2.3750052458394664, + "grad_norm": 9.925461769104004, + "learning_rate": 1.0416579236008892e-05, + "loss": 2.5827, + "step": 7640000 + }, + { + "epoch": 2.3751606781199532, + "grad_norm": 9.232441902160645, + "learning_rate": 1.041398869800078e-05, + "loss": 2.6628, + "step": 7640500 + }, + { + "epoch": 2.37531611040044, + "grad_norm": 8.620626449584961, + "learning_rate": 1.0411398159992663e-05, + "loss": 2.646, + "step": 7641000 + }, + { + "epoch": 2.375471542680927, + "grad_norm": 8.420385360717773, + "learning_rate": 1.040880762198455e-05, + "loss": 2.6389, + "step": 7641500 + }, + { + "epoch": 2.375626974961414, + "grad_norm": 10.42725658416748, + "learning_rate": 1.0406217083976436e-05, + "loss": 2.6327, + "step": 7642000 + }, + { + "epoch": 2.3757824072419007, + "grad_norm": 9.56383991241455, + "learning_rate": 1.040362654596832e-05, + "loss": 2.6379, + "step": 7642500 + }, + { + "epoch": 2.3759378395223876, + "grad_norm": 9.439765930175781, + "learning_rate": 1.0401036007960207e-05, + "loss": 2.6036, + "step": 7643000 + }, + { + "epoch": 2.3760932718028744, + "grad_norm": 8.388829231262207, + "learning_rate": 1.039844546995209e-05, + "loss": 2.6169, + "step": 7643500 + }, + { + "epoch": 2.3762487040833613, + "grad_norm": 18.007244110107422, + "learning_rate": 1.0395854931943978e-05, + "loss": 2.6252, + "step": 7644000 + }, + { + "epoch": 2.376404136363848, + "grad_norm": 15.843867301940918, + "learning_rate": 1.0393264393935861e-05, + "loss": 2.644, + "step": 7644500 + }, + { + "epoch": 2.376559568644335, + "grad_norm": 9.718220710754395, + "learning_rate": 1.0390673855927747e-05, + "loss": 2.6034, + "step": 7645000 + }, + { + "epoch": 2.376715000924822, + "grad_norm": 11.495186805725098, + "learning_rate": 1.0388083317919632e-05, + "loss": 2.6211, + "step": 7645500 + }, + { + "epoch": 2.3768704332053088, + "grad_norm": 13.756104469299316, + "learning_rate": 1.0385492779911518e-05, + "loss": 2.6051, + "step": 7646000 + }, + { + "epoch": 2.3770258654857956, + "grad_norm": 9.495745658874512, + "learning_rate": 1.0382902241903405e-05, + "loss": 2.6577, + "step": 7646500 + }, + { + "epoch": 2.3771812977662825, + "grad_norm": 10.108380317687988, + "learning_rate": 1.0380311703895289e-05, + "loss": 2.6433, + "step": 7647000 + }, + { + "epoch": 2.3773367300467694, + "grad_norm": 18.33209991455078, + "learning_rate": 1.0377721165887174e-05, + "loss": 2.6398, + "step": 7647500 + }, + { + "epoch": 2.3774921623272567, + "grad_norm": 9.070531845092773, + "learning_rate": 1.037513062787906e-05, + "loss": 2.5972, + "step": 7648000 + }, + { + "epoch": 2.377647594607743, + "grad_norm": 15.578889846801758, + "learning_rate": 1.0372540089870945e-05, + "loss": 2.6263, + "step": 7648500 + }, + { + "epoch": 2.3778030268882304, + "grad_norm": 10.114837646484375, + "learning_rate": 1.036994955186283e-05, + "loss": 2.656, + "step": 7649000 + }, + { + "epoch": 2.377958459168717, + "grad_norm": 8.547218322753906, + "learning_rate": 1.0367359013854716e-05, + "loss": 2.6275, + "step": 7649500 + }, + { + "epoch": 2.378113891449204, + "grad_norm": 10.860450744628906, + "learning_rate": 1.0364768475846601e-05, + "loss": 2.6247, + "step": 7650000 + }, + { + "epoch": 2.378269323729691, + "grad_norm": 11.055370330810547, + "learning_rate": 1.0362177937838487e-05, + "loss": 2.5708, + "step": 7650500 + }, + { + "epoch": 2.378424756010178, + "grad_norm": 14.889405250549316, + "learning_rate": 1.0359587399830372e-05, + "loss": 2.6082, + "step": 7651000 + }, + { + "epoch": 2.3785801882906648, + "grad_norm": 12.64211654663086, + "learning_rate": 1.0356996861822258e-05, + "loss": 2.6215, + "step": 7651500 + }, + { + "epoch": 2.3787356205711516, + "grad_norm": 9.486395835876465, + "learning_rate": 1.0354406323814143e-05, + "loss": 2.6104, + "step": 7652000 + }, + { + "epoch": 2.3788910528516385, + "grad_norm": 9.960641860961914, + "learning_rate": 1.0351815785806029e-05, + "loss": 2.6272, + "step": 7652500 + }, + { + "epoch": 2.3790464851321254, + "grad_norm": 9.660746574401855, + "learning_rate": 1.0349225247797914e-05, + "loss": 2.6675, + "step": 7653000 + }, + { + "epoch": 2.379201917412612, + "grad_norm": 13.352054595947266, + "learning_rate": 1.0346634709789798e-05, + "loss": 2.6224, + "step": 7653500 + }, + { + "epoch": 2.379357349693099, + "grad_norm": 9.431139945983887, + "learning_rate": 1.0344044171781685e-05, + "loss": 2.619, + "step": 7654000 + }, + { + "epoch": 2.379512781973586, + "grad_norm": 8.40485668182373, + "learning_rate": 1.0341453633773569e-05, + "loss": 2.6786, + "step": 7654500 + }, + { + "epoch": 2.379668214254073, + "grad_norm": 13.761187553405762, + "learning_rate": 1.0338863095765456e-05, + "loss": 2.6064, + "step": 7655000 + }, + { + "epoch": 2.3798236465345597, + "grad_norm": 10.032649993896484, + "learning_rate": 1.0336272557757341e-05, + "loss": 2.6026, + "step": 7655500 + }, + { + "epoch": 2.3799790788150466, + "grad_norm": 10.052594184875488, + "learning_rate": 1.0333682019749225e-05, + "loss": 2.6503, + "step": 7656000 + }, + { + "epoch": 2.3801345110955334, + "grad_norm": 8.191683769226074, + "learning_rate": 1.0331091481741112e-05, + "loss": 2.6469, + "step": 7656500 + }, + { + "epoch": 2.3802899433760203, + "grad_norm": 10.101724624633789, + "learning_rate": 1.0328500943732996e-05, + "loss": 2.6572, + "step": 7657000 + }, + { + "epoch": 2.380445375656507, + "grad_norm": 11.210586547851562, + "learning_rate": 1.0325910405724883e-05, + "loss": 2.627, + "step": 7657500 + }, + { + "epoch": 2.380600807936994, + "grad_norm": 13.692480087280273, + "learning_rate": 1.0323319867716767e-05, + "loss": 2.6432, + "step": 7658000 + }, + { + "epoch": 2.380756240217481, + "grad_norm": 14.123024940490723, + "learning_rate": 1.0320729329708652e-05, + "loss": 2.6057, + "step": 7658500 + }, + { + "epoch": 2.3809116724979678, + "grad_norm": 9.935215950012207, + "learning_rate": 1.0318138791700538e-05, + "loss": 2.6002, + "step": 7659000 + }, + { + "epoch": 2.3810671047784546, + "grad_norm": 11.770517349243164, + "learning_rate": 1.0315548253692423e-05, + "loss": 2.6138, + "step": 7659500 + }, + { + "epoch": 2.3812225370589415, + "grad_norm": 8.472552299499512, + "learning_rate": 1.031295771568431e-05, + "loss": 2.5991, + "step": 7660000 + }, + { + "epoch": 2.3813779693394284, + "grad_norm": 15.274025917053223, + "learning_rate": 1.0310367177676194e-05, + "loss": 2.6378, + "step": 7660500 + }, + { + "epoch": 2.3815334016199152, + "grad_norm": 10.858792304992676, + "learning_rate": 1.030777663966808e-05, + "loss": 2.6016, + "step": 7661000 + }, + { + "epoch": 2.381688833900402, + "grad_norm": 10.900794982910156, + "learning_rate": 1.0305186101659965e-05, + "loss": 2.6194, + "step": 7661500 + }, + { + "epoch": 2.381844266180889, + "grad_norm": 8.639680862426758, + "learning_rate": 1.030259556365185e-05, + "loss": 2.6697, + "step": 7662000 + }, + { + "epoch": 2.381999698461376, + "grad_norm": 8.34786319732666, + "learning_rate": 1.0300005025643736e-05, + "loss": 2.628, + "step": 7662500 + }, + { + "epoch": 2.3821551307418627, + "grad_norm": 12.60185718536377, + "learning_rate": 1.0297414487635622e-05, + "loss": 2.6001, + "step": 7663000 + }, + { + "epoch": 2.3823105630223496, + "grad_norm": 13.31654167175293, + "learning_rate": 1.0294823949627507e-05, + "loss": 2.649, + "step": 7663500 + }, + { + "epoch": 2.3824659953028364, + "grad_norm": 9.748499870300293, + "learning_rate": 1.0292233411619392e-05, + "loss": 2.6468, + "step": 7664000 + }, + { + "epoch": 2.3826214275833233, + "grad_norm": 14.681357383728027, + "learning_rate": 1.0289642873611278e-05, + "loss": 2.6327, + "step": 7664500 + }, + { + "epoch": 2.38277685986381, + "grad_norm": 7.962843894958496, + "learning_rate": 1.0287052335603163e-05, + "loss": 2.6062, + "step": 7665000 + }, + { + "epoch": 2.382932292144297, + "grad_norm": 10.070387840270996, + "learning_rate": 1.0284461797595049e-05, + "loss": 2.6304, + "step": 7665500 + }, + { + "epoch": 2.383087724424784, + "grad_norm": 11.4183988571167, + "learning_rate": 1.0281871259586934e-05, + "loss": 2.6081, + "step": 7666000 + }, + { + "epoch": 2.3832431567052708, + "grad_norm": 9.705262184143066, + "learning_rate": 1.027928072157882e-05, + "loss": 2.6289, + "step": 7666500 + }, + { + "epoch": 2.3833985889857576, + "grad_norm": 6.324457168579102, + "learning_rate": 1.0276690183570704e-05, + "loss": 2.582, + "step": 7667000 + }, + { + "epoch": 2.3835540212662445, + "grad_norm": 10.573335647583008, + "learning_rate": 1.027409964556259e-05, + "loss": 2.631, + "step": 7667500 + }, + { + "epoch": 2.3837094535467314, + "grad_norm": 9.279868125915527, + "learning_rate": 1.0271509107554474e-05, + "loss": 2.6289, + "step": 7668000 + }, + { + "epoch": 2.3838648858272182, + "grad_norm": 9.322511672973633, + "learning_rate": 1.0268918569546362e-05, + "loss": 2.6244, + "step": 7668500 + }, + { + "epoch": 2.384020318107705, + "grad_norm": 10.549835205078125, + "learning_rate": 1.0266328031538247e-05, + "loss": 2.6101, + "step": 7669000 + }, + { + "epoch": 2.384175750388192, + "grad_norm": 11.404762268066406, + "learning_rate": 1.0263737493530131e-05, + "loss": 2.6406, + "step": 7669500 + }, + { + "epoch": 2.384331182668679, + "grad_norm": 11.872332572937012, + "learning_rate": 1.0261146955522018e-05, + "loss": 2.6404, + "step": 7670000 + }, + { + "epoch": 2.3844866149491657, + "grad_norm": 9.65018367767334, + "learning_rate": 1.0258556417513902e-05, + "loss": 2.6297, + "step": 7670500 + }, + { + "epoch": 2.3846420472296526, + "grad_norm": 9.822711944580078, + "learning_rate": 1.0255965879505789e-05, + "loss": 2.6781, + "step": 7671000 + }, + { + "epoch": 2.38479747951014, + "grad_norm": 11.30522632598877, + "learning_rate": 1.0253375341497673e-05, + "loss": 2.5919, + "step": 7671500 + }, + { + "epoch": 2.3849529117906263, + "grad_norm": 8.496849060058594, + "learning_rate": 1.0250784803489558e-05, + "loss": 2.627, + "step": 7672000 + }, + { + "epoch": 2.3851083440711136, + "grad_norm": 9.035840034484863, + "learning_rate": 1.0248194265481444e-05, + "loss": 2.5565, + "step": 7672500 + }, + { + "epoch": 2.3852637763516, + "grad_norm": 20.779598236083984, + "learning_rate": 1.0245603727473329e-05, + "loss": 2.6344, + "step": 7673000 + }, + { + "epoch": 2.3854192086320873, + "grad_norm": 8.985211372375488, + "learning_rate": 1.0243013189465216e-05, + "loss": 2.6329, + "step": 7673500 + }, + { + "epoch": 2.3855746409125738, + "grad_norm": 9.757477760314941, + "learning_rate": 1.02404226514571e-05, + "loss": 2.5938, + "step": 7674000 + }, + { + "epoch": 2.385730073193061, + "grad_norm": 13.491518020629883, + "learning_rate": 1.0237832113448985e-05, + "loss": 2.6135, + "step": 7674500 + }, + { + "epoch": 2.385885505473548, + "grad_norm": 10.188042640686035, + "learning_rate": 1.0235241575440871e-05, + "loss": 2.6757, + "step": 7675000 + }, + { + "epoch": 2.386040937754035, + "grad_norm": 10.002630233764648, + "learning_rate": 1.0232651037432756e-05, + "loss": 2.6995, + "step": 7675500 + }, + { + "epoch": 2.3861963700345217, + "grad_norm": 11.466670036315918, + "learning_rate": 1.0230060499424642e-05, + "loss": 2.6144, + "step": 7676000 + }, + { + "epoch": 2.3863518023150085, + "grad_norm": 9.074424743652344, + "learning_rate": 1.0227469961416527e-05, + "loss": 2.6465, + "step": 7676500 + }, + { + "epoch": 2.3865072345954954, + "grad_norm": 9.680421829223633, + "learning_rate": 1.0224879423408413e-05, + "loss": 2.6551, + "step": 7677000 + }, + { + "epoch": 2.3866626668759823, + "grad_norm": 24.02916717529297, + "learning_rate": 1.0222288885400298e-05, + "loss": 2.626, + "step": 7677500 + }, + { + "epoch": 2.386818099156469, + "grad_norm": 8.735042572021484, + "learning_rate": 1.0219698347392184e-05, + "loss": 2.6346, + "step": 7678000 + }, + { + "epoch": 2.386973531436956, + "grad_norm": 12.470072746276855, + "learning_rate": 1.0217107809384069e-05, + "loss": 2.6564, + "step": 7678500 + }, + { + "epoch": 2.387128963717443, + "grad_norm": 11.162729263305664, + "learning_rate": 1.0214517271375955e-05, + "loss": 2.6532, + "step": 7679000 + }, + { + "epoch": 2.3872843959979297, + "grad_norm": 10.625710487365723, + "learning_rate": 1.021192673336784e-05, + "loss": 2.6329, + "step": 7679500 + }, + { + "epoch": 2.3874398282784166, + "grad_norm": 9.949363708496094, + "learning_rate": 1.0209336195359726e-05, + "loss": 2.656, + "step": 7680000 + }, + { + "epoch": 2.3875952605589035, + "grad_norm": 7.581353664398193, + "learning_rate": 1.0206745657351611e-05, + "loss": 2.6457, + "step": 7680500 + }, + { + "epoch": 2.3877506928393903, + "grad_norm": 6.690342426300049, + "learning_rate": 1.0204155119343496e-05, + "loss": 2.6084, + "step": 7681000 + }, + { + "epoch": 2.387906125119877, + "grad_norm": 9.629535675048828, + "learning_rate": 1.020156458133538e-05, + "loss": 2.6691, + "step": 7681500 + }, + { + "epoch": 2.388061557400364, + "grad_norm": 11.575016021728516, + "learning_rate": 1.0198974043327267e-05, + "loss": 2.6249, + "step": 7682000 + }, + { + "epoch": 2.388216989680851, + "grad_norm": 8.650436401367188, + "learning_rate": 1.0196383505319153e-05, + "loss": 2.6126, + "step": 7682500 + }, + { + "epoch": 2.388372421961338, + "grad_norm": 10.24293327331543, + "learning_rate": 1.0193792967311038e-05, + "loss": 2.6512, + "step": 7683000 + }, + { + "epoch": 2.3885278542418247, + "grad_norm": 9.64506721496582, + "learning_rate": 1.0191202429302924e-05, + "loss": 2.6228, + "step": 7683500 + }, + { + "epoch": 2.3886832865223115, + "grad_norm": 9.162193298339844, + "learning_rate": 1.0188611891294807e-05, + "loss": 2.598, + "step": 7684000 + }, + { + "epoch": 2.3888387188027984, + "grad_norm": 8.912632942199707, + "learning_rate": 1.0186021353286695e-05, + "loss": 2.6248, + "step": 7684500 + }, + { + "epoch": 2.3889941510832853, + "grad_norm": 11.173699378967285, + "learning_rate": 1.0183430815278578e-05, + "loss": 2.6508, + "step": 7685000 + }, + { + "epoch": 2.389149583363772, + "grad_norm": 26.24358558654785, + "learning_rate": 1.0180840277270466e-05, + "loss": 2.6701, + "step": 7685500 + }, + { + "epoch": 2.389305015644259, + "grad_norm": 9.319127082824707, + "learning_rate": 1.017824973926235e-05, + "loss": 2.6692, + "step": 7686000 + }, + { + "epoch": 2.389460447924746, + "grad_norm": 10.082698822021484, + "learning_rate": 1.0175659201254235e-05, + "loss": 2.6572, + "step": 7686500 + }, + { + "epoch": 2.3896158802052327, + "grad_norm": 9.687638282775879, + "learning_rate": 1.0173068663246122e-05, + "loss": 2.6565, + "step": 7687000 + }, + { + "epoch": 2.3897713124857196, + "grad_norm": 9.664369583129883, + "learning_rate": 1.0170478125238006e-05, + "loss": 2.6459, + "step": 7687500 + }, + { + "epoch": 2.3899267447662065, + "grad_norm": 9.993730545043945, + "learning_rate": 1.0167887587229893e-05, + "loss": 2.6026, + "step": 7688000 + }, + { + "epoch": 2.3900821770466933, + "grad_norm": 10.745393753051758, + "learning_rate": 1.0165297049221777e-05, + "loss": 2.6451, + "step": 7688500 + }, + { + "epoch": 2.39023760932718, + "grad_norm": 8.250924110412598, + "learning_rate": 1.0162706511213662e-05, + "loss": 2.5883, + "step": 7689000 + }, + { + "epoch": 2.390393041607667, + "grad_norm": 10.659090995788574, + "learning_rate": 1.0160115973205548e-05, + "loss": 2.62, + "step": 7689500 + }, + { + "epoch": 2.390548473888154, + "grad_norm": 10.78033447265625, + "learning_rate": 1.0157525435197433e-05, + "loss": 2.6179, + "step": 7690000 + }, + { + "epoch": 2.390703906168641, + "grad_norm": 11.544971466064453, + "learning_rate": 1.0154934897189318e-05, + "loss": 2.5976, + "step": 7690500 + }, + { + "epoch": 2.3908593384491277, + "grad_norm": 9.458563804626465, + "learning_rate": 1.0152344359181204e-05, + "loss": 2.6419, + "step": 7691000 + }, + { + "epoch": 2.3910147707296145, + "grad_norm": 20.450700759887695, + "learning_rate": 1.014975382117309e-05, + "loss": 2.6715, + "step": 7691500 + }, + { + "epoch": 2.3911702030101014, + "grad_norm": 10.120070457458496, + "learning_rate": 1.0147163283164975e-05, + "loss": 2.6268, + "step": 7692000 + }, + { + "epoch": 2.3913256352905883, + "grad_norm": 10.222755432128906, + "learning_rate": 1.014457274515686e-05, + "loss": 2.6507, + "step": 7692500 + }, + { + "epoch": 2.391481067571075, + "grad_norm": 10.095602035522461, + "learning_rate": 1.0141982207148746e-05, + "loss": 2.637, + "step": 7693000 + }, + { + "epoch": 2.391636499851562, + "grad_norm": 23.960575103759766, + "learning_rate": 1.0139391669140631e-05, + "loss": 2.6486, + "step": 7693500 + }, + { + "epoch": 2.391791932132049, + "grad_norm": 11.78918170928955, + "learning_rate": 1.0136801131132517e-05, + "loss": 2.6301, + "step": 7694000 + }, + { + "epoch": 2.3919473644125357, + "grad_norm": 12.402305603027344, + "learning_rate": 1.0134210593124402e-05, + "loss": 2.6051, + "step": 7694500 + }, + { + "epoch": 2.3921027966930226, + "grad_norm": 8.589320182800293, + "learning_rate": 1.0131620055116286e-05, + "loss": 2.6148, + "step": 7695000 + }, + { + "epoch": 2.3922582289735095, + "grad_norm": 11.574929237365723, + "learning_rate": 1.0129029517108173e-05, + "loss": 2.6487, + "step": 7695500 + }, + { + "epoch": 2.392413661253997, + "grad_norm": 12.30440902709961, + "learning_rate": 1.0126438979100059e-05, + "loss": 2.5842, + "step": 7696000 + }, + { + "epoch": 2.392569093534483, + "grad_norm": 13.353715896606445, + "learning_rate": 1.0123848441091944e-05, + "loss": 2.6128, + "step": 7696500 + }, + { + "epoch": 2.3927245258149705, + "grad_norm": 10.303857803344727, + "learning_rate": 1.012125790308383e-05, + "loss": 2.6511, + "step": 7697000 + }, + { + "epoch": 2.392879958095457, + "grad_norm": 9.229736328125, + "learning_rate": 1.0118667365075713e-05, + "loss": 2.6346, + "step": 7697500 + }, + { + "epoch": 2.3930353903759443, + "grad_norm": 22.51342010498047, + "learning_rate": 1.01160768270676e-05, + "loss": 2.6263, + "step": 7698000 + }, + { + "epoch": 2.393190822656431, + "grad_norm": 8.750147819519043, + "learning_rate": 1.0113486289059484e-05, + "loss": 2.5756, + "step": 7698500 + }, + { + "epoch": 2.393346254936918, + "grad_norm": 10.113512992858887, + "learning_rate": 1.0110895751051371e-05, + "loss": 2.6557, + "step": 7699000 + }, + { + "epoch": 2.393501687217405, + "grad_norm": 13.216445922851562, + "learning_rate": 1.0108305213043255e-05, + "loss": 2.6189, + "step": 7699500 + }, + { + "epoch": 2.3936571194978917, + "grad_norm": 10.45602035522461, + "learning_rate": 1.010571467503514e-05, + "loss": 2.6183, + "step": 7700000 + }, + { + "epoch": 2.3938125517783786, + "grad_norm": 16.32117462158203, + "learning_rate": 1.0103124137027028e-05, + "loss": 2.6276, + "step": 7700500 + }, + { + "epoch": 2.3939679840588655, + "grad_norm": 7.844968318939209, + "learning_rate": 1.0100533599018911e-05, + "loss": 2.6628, + "step": 7701000 + }, + { + "epoch": 2.3941234163393523, + "grad_norm": 12.01993465423584, + "learning_rate": 1.0097943061010799e-05, + "loss": 2.6319, + "step": 7701500 + }, + { + "epoch": 2.394278848619839, + "grad_norm": 13.998226165771484, + "learning_rate": 1.0095352523002682e-05, + "loss": 2.6243, + "step": 7702000 + }, + { + "epoch": 2.394434280900326, + "grad_norm": 10.022564888000488, + "learning_rate": 1.0092761984994568e-05, + "loss": 2.5949, + "step": 7702500 + }, + { + "epoch": 2.394589713180813, + "grad_norm": 10.942117691040039, + "learning_rate": 1.0090171446986453e-05, + "loss": 2.646, + "step": 7703000 + }, + { + "epoch": 2.3947451454613, + "grad_norm": 9.294859886169434, + "learning_rate": 1.0087580908978339e-05, + "loss": 2.6562, + "step": 7703500 + }, + { + "epoch": 2.3949005777417867, + "grad_norm": 10.879551887512207, + "learning_rate": 1.0084990370970224e-05, + "loss": 2.6216, + "step": 7704000 + }, + { + "epoch": 2.3950560100222735, + "grad_norm": 16.44019889831543, + "learning_rate": 1.008239983296211e-05, + "loss": 2.6258, + "step": 7704500 + }, + { + "epoch": 2.3952114423027604, + "grad_norm": 11.768026351928711, + "learning_rate": 1.0079809294953995e-05, + "loss": 2.6362, + "step": 7705000 + }, + { + "epoch": 2.3953668745832473, + "grad_norm": 10.19976806640625, + "learning_rate": 1.007721875694588e-05, + "loss": 2.6824, + "step": 7705500 + }, + { + "epoch": 2.395522306863734, + "grad_norm": 10.727747917175293, + "learning_rate": 1.0074628218937766e-05, + "loss": 2.6391, + "step": 7706000 + }, + { + "epoch": 2.395677739144221, + "grad_norm": 9.79180908203125, + "learning_rate": 1.0072037680929651e-05, + "loss": 2.6336, + "step": 7706500 + }, + { + "epoch": 2.395833171424708, + "grad_norm": 8.815486907958984, + "learning_rate": 1.0069447142921537e-05, + "loss": 2.6091, + "step": 7707000 + }, + { + "epoch": 2.3959886037051947, + "grad_norm": 8.763662338256836, + "learning_rate": 1.0066856604913422e-05, + "loss": 2.5881, + "step": 7707500 + }, + { + "epoch": 2.3961440359856816, + "grad_norm": 13.834298133850098, + "learning_rate": 1.0064266066905308e-05, + "loss": 2.6485, + "step": 7708000 + }, + { + "epoch": 2.3962994682661685, + "grad_norm": 19.15938949584961, + "learning_rate": 1.0061675528897192e-05, + "loss": 2.6163, + "step": 7708500 + }, + { + "epoch": 2.3964549005466553, + "grad_norm": 10.879526138305664, + "learning_rate": 1.0059084990889079e-05, + "loss": 2.6138, + "step": 7709000 + }, + { + "epoch": 2.396610332827142, + "grad_norm": 9.326117515563965, + "learning_rate": 1.0056494452880964e-05, + "loss": 2.6218, + "step": 7709500 + }, + { + "epoch": 2.396765765107629, + "grad_norm": 10.21293830871582, + "learning_rate": 1.005390391487285e-05, + "loss": 2.6178, + "step": 7710000 + }, + { + "epoch": 2.396921197388116, + "grad_norm": 52.65516662597656, + "learning_rate": 1.0051313376864735e-05, + "loss": 2.5841, + "step": 7710500 + }, + { + "epoch": 2.397076629668603, + "grad_norm": 10.710970878601074, + "learning_rate": 1.0048722838856619e-05, + "loss": 2.6237, + "step": 7711000 + }, + { + "epoch": 2.3972320619490897, + "grad_norm": 9.382680892944336, + "learning_rate": 1.0046132300848506e-05, + "loss": 2.6351, + "step": 7711500 + }, + { + "epoch": 2.3973874942295765, + "grad_norm": 9.120769500732422, + "learning_rate": 1.004354176284039e-05, + "loss": 2.63, + "step": 7712000 + }, + { + "epoch": 2.3975429265100634, + "grad_norm": 10.064581871032715, + "learning_rate": 1.0040951224832277e-05, + "loss": 2.5897, + "step": 7712500 + }, + { + "epoch": 2.3976983587905503, + "grad_norm": 10.514862060546875, + "learning_rate": 1.003836068682416e-05, + "loss": 2.5999, + "step": 7713000 + }, + { + "epoch": 2.397853791071037, + "grad_norm": 10.048866271972656, + "learning_rate": 1.0035770148816046e-05, + "loss": 2.6053, + "step": 7713500 + }, + { + "epoch": 2.398009223351524, + "grad_norm": 9.89942741394043, + "learning_rate": 1.0033179610807933e-05, + "loss": 2.6298, + "step": 7714000 + }, + { + "epoch": 2.398164655632011, + "grad_norm": 12.819928169250488, + "learning_rate": 1.0030589072799817e-05, + "loss": 2.6016, + "step": 7714500 + }, + { + "epoch": 2.3983200879124977, + "grad_norm": 11.66602611541748, + "learning_rate": 1.0027998534791704e-05, + "loss": 2.6349, + "step": 7715000 + }, + { + "epoch": 2.3984755201929846, + "grad_norm": 8.86362075805664, + "learning_rate": 1.0025407996783588e-05, + "loss": 2.6017, + "step": 7715500 + }, + { + "epoch": 2.3986309524734715, + "grad_norm": 9.620673179626465, + "learning_rate": 1.0022817458775473e-05, + "loss": 2.5714, + "step": 7716000 + }, + { + "epoch": 2.3987863847539583, + "grad_norm": 11.359984397888184, + "learning_rate": 1.0020226920767359e-05, + "loss": 2.6566, + "step": 7716500 + }, + { + "epoch": 2.398941817034445, + "grad_norm": 11.644124031066895, + "learning_rate": 1.0017636382759244e-05, + "loss": 2.5821, + "step": 7717000 + }, + { + "epoch": 2.399097249314932, + "grad_norm": 16.28424072265625, + "learning_rate": 1.001504584475113e-05, + "loss": 2.6115, + "step": 7717500 + }, + { + "epoch": 2.399252681595419, + "grad_norm": 11.644513130187988, + "learning_rate": 1.0012455306743015e-05, + "loss": 2.6066, + "step": 7718000 + }, + { + "epoch": 2.399408113875906, + "grad_norm": 10.011406898498535, + "learning_rate": 1.00098647687349e-05, + "loss": 2.6119, + "step": 7718500 + }, + { + "epoch": 2.3995635461563927, + "grad_norm": 10.85783863067627, + "learning_rate": 1.0007274230726786e-05, + "loss": 2.6032, + "step": 7719000 + }, + { + "epoch": 2.3997189784368795, + "grad_norm": 16.579347610473633, + "learning_rate": 1.0004683692718672e-05, + "loss": 2.6793, + "step": 7719500 + }, + { + "epoch": 2.3998744107173664, + "grad_norm": 11.394848823547363, + "learning_rate": 1.0002093154710557e-05, + "loss": 2.6037, + "step": 7720000 + }, + { + "epoch": 2.4000298429978537, + "grad_norm": 12.568849563598633, + "learning_rate": 9.999502616702443e-06, + "loss": 2.6171, + "step": 7720500 + }, + { + "epoch": 2.40018527527834, + "grad_norm": 9.177180290222168, + "learning_rate": 9.996912078694328e-06, + "loss": 2.6235, + "step": 7721000 + }, + { + "epoch": 2.4003407075588274, + "grad_norm": 9.893975257873535, + "learning_rate": 9.994321540686214e-06, + "loss": 2.6283, + "step": 7721500 + }, + { + "epoch": 2.400496139839314, + "grad_norm": 10.221390724182129, + "learning_rate": 9.991731002678097e-06, + "loss": 2.5843, + "step": 7722000 + }, + { + "epoch": 2.400651572119801, + "grad_norm": 12.001571655273438, + "learning_rate": 9.989140464669984e-06, + "loss": 2.6108, + "step": 7722500 + }, + { + "epoch": 2.400807004400288, + "grad_norm": 14.040224075317383, + "learning_rate": 9.98654992666187e-06, + "loss": 2.6389, + "step": 7723000 + }, + { + "epoch": 2.400962436680775, + "grad_norm": 11.932395935058594, + "learning_rate": 9.983959388653755e-06, + "loss": 2.643, + "step": 7723500 + }, + { + "epoch": 2.401117868961262, + "grad_norm": 11.767428398132324, + "learning_rate": 9.98136885064564e-06, + "loss": 2.6098, + "step": 7724000 + }, + { + "epoch": 2.4012733012417486, + "grad_norm": 7.753969192504883, + "learning_rate": 9.978778312637525e-06, + "loss": 2.6478, + "step": 7724500 + }, + { + "epoch": 2.4014287335222355, + "grad_norm": 9.423369407653809, + "learning_rate": 9.976187774629412e-06, + "loss": 2.6626, + "step": 7725000 + }, + { + "epoch": 2.4015841658027224, + "grad_norm": 10.764932632446289, + "learning_rate": 9.973597236621296e-06, + "loss": 2.6552, + "step": 7725500 + }, + { + "epoch": 2.4017395980832092, + "grad_norm": 8.420109748840332, + "learning_rate": 9.971006698613183e-06, + "loss": 2.5917, + "step": 7726000 + }, + { + "epoch": 2.401895030363696, + "grad_norm": 11.828045845031738, + "learning_rate": 9.968416160605066e-06, + "loss": 2.5878, + "step": 7726500 + }, + { + "epoch": 2.402050462644183, + "grad_norm": 9.43191146850586, + "learning_rate": 9.965825622596952e-06, + "loss": 2.6442, + "step": 7727000 + }, + { + "epoch": 2.40220589492467, + "grad_norm": 12.448067665100098, + "learning_rate": 9.963235084588839e-06, + "loss": 2.616, + "step": 7727500 + }, + { + "epoch": 2.4023613272051567, + "grad_norm": 9.306260108947754, + "learning_rate": 9.960644546580723e-06, + "loss": 2.6917, + "step": 7728000 + }, + { + "epoch": 2.4025167594856436, + "grad_norm": 8.860391616821289, + "learning_rate": 9.95805400857261e-06, + "loss": 2.6373, + "step": 7728500 + }, + { + "epoch": 2.4026721917661304, + "grad_norm": 10.52589225769043, + "learning_rate": 9.955463470564494e-06, + "loss": 2.6653, + "step": 7729000 + }, + { + "epoch": 2.4028276240466173, + "grad_norm": 13.089813232421875, + "learning_rate": 9.95287293255638e-06, + "loss": 2.6054, + "step": 7729500 + }, + { + "epoch": 2.402983056327104, + "grad_norm": 11.567262649536133, + "learning_rate": 9.950282394548265e-06, + "loss": 2.6283, + "step": 7730000 + }, + { + "epoch": 2.403138488607591, + "grad_norm": 10.792601585388184, + "learning_rate": 9.94769185654015e-06, + "loss": 2.6795, + "step": 7730500 + }, + { + "epoch": 2.403293920888078, + "grad_norm": 10.592126846313477, + "learning_rate": 9.945101318532036e-06, + "loss": 2.6096, + "step": 7731000 + }, + { + "epoch": 2.403449353168565, + "grad_norm": 11.581417083740234, + "learning_rate": 9.942510780523921e-06, + "loss": 2.6141, + "step": 7731500 + }, + { + "epoch": 2.4036047854490517, + "grad_norm": 18.079078674316406, + "learning_rate": 9.939920242515806e-06, + "loss": 2.6155, + "step": 7732000 + }, + { + "epoch": 2.4037602177295385, + "grad_norm": 9.505462646484375, + "learning_rate": 9.937329704507692e-06, + "loss": 2.5983, + "step": 7732500 + }, + { + "epoch": 2.4039156500100254, + "grad_norm": 10.85403823852539, + "learning_rate": 9.934739166499577e-06, + "loss": 2.6345, + "step": 7733000 + }, + { + "epoch": 2.4040710822905123, + "grad_norm": 9.10004997253418, + "learning_rate": 9.932148628491463e-06, + "loss": 2.6549, + "step": 7733500 + }, + { + "epoch": 2.404226514570999, + "grad_norm": 9.649248123168945, + "learning_rate": 9.929558090483348e-06, + "loss": 2.607, + "step": 7734000 + }, + { + "epoch": 2.404381946851486, + "grad_norm": 9.895893096923828, + "learning_rate": 9.926967552475234e-06, + "loss": 2.6177, + "step": 7734500 + }, + { + "epoch": 2.404537379131973, + "grad_norm": 8.922016143798828, + "learning_rate": 9.92437701446712e-06, + "loss": 2.6137, + "step": 7735000 + }, + { + "epoch": 2.4046928114124597, + "grad_norm": 9.374663352966309, + "learning_rate": 9.921786476459003e-06, + "loss": 2.6345, + "step": 7735500 + }, + { + "epoch": 2.4048482436929466, + "grad_norm": 9.74415111541748, + "learning_rate": 9.91919593845089e-06, + "loss": 2.6344, + "step": 7736000 + }, + { + "epoch": 2.4050036759734335, + "grad_norm": 10.604957580566406, + "learning_rate": 9.916605400442776e-06, + "loss": 2.634, + "step": 7736500 + }, + { + "epoch": 2.4051591082539203, + "grad_norm": 10.547706604003906, + "learning_rate": 9.914014862434661e-06, + "loss": 2.67, + "step": 7737000 + }, + { + "epoch": 2.405314540534407, + "grad_norm": 10.315150260925293, + "learning_rate": 9.911424324426547e-06, + "loss": 2.6334, + "step": 7737500 + }, + { + "epoch": 2.405469972814894, + "grad_norm": 10.650662422180176, + "learning_rate": 9.90883378641843e-06, + "loss": 2.6189, + "step": 7738000 + }, + { + "epoch": 2.405625405095381, + "grad_norm": 9.972326278686523, + "learning_rate": 9.906243248410317e-06, + "loss": 2.6178, + "step": 7738500 + }, + { + "epoch": 2.405780837375868, + "grad_norm": 12.339733123779297, + "learning_rate": 9.903652710402201e-06, + "loss": 2.6386, + "step": 7739000 + }, + { + "epoch": 2.4059362696563547, + "grad_norm": 10.303030014038086, + "learning_rate": 9.901062172394088e-06, + "loss": 2.6503, + "step": 7739500 + }, + { + "epoch": 2.4060917019368415, + "grad_norm": 12.868949890136719, + "learning_rate": 9.898471634385972e-06, + "loss": 2.646, + "step": 7740000 + }, + { + "epoch": 2.4062471342173284, + "grad_norm": 17.075197219848633, + "learning_rate": 9.895881096377858e-06, + "loss": 2.5854, + "step": 7740500 + }, + { + "epoch": 2.4064025664978153, + "grad_norm": 13.966090202331543, + "learning_rate": 9.893290558369745e-06, + "loss": 2.6256, + "step": 7741000 + }, + { + "epoch": 2.406557998778302, + "grad_norm": 32.36052322387695, + "learning_rate": 9.890700020361629e-06, + "loss": 2.6388, + "step": 7741500 + }, + { + "epoch": 2.406713431058789, + "grad_norm": 10.196980476379395, + "learning_rate": 9.888109482353516e-06, + "loss": 2.6378, + "step": 7742000 + }, + { + "epoch": 2.406868863339276, + "grad_norm": 10.045252799987793, + "learning_rate": 9.8855189443454e-06, + "loss": 2.6472, + "step": 7742500 + }, + { + "epoch": 2.4070242956197627, + "grad_norm": 10.607633590698242, + "learning_rate": 9.882928406337285e-06, + "loss": 2.6035, + "step": 7743000 + }, + { + "epoch": 2.4071797279002496, + "grad_norm": 17.53315544128418, + "learning_rate": 9.88033786832917e-06, + "loss": 2.6516, + "step": 7743500 + }, + { + "epoch": 2.407335160180737, + "grad_norm": 8.38535213470459, + "learning_rate": 9.877747330321056e-06, + "loss": 2.6145, + "step": 7744000 + }, + { + "epoch": 2.4074905924612233, + "grad_norm": 7.605174541473389, + "learning_rate": 9.875156792312941e-06, + "loss": 2.6356, + "step": 7744500 + }, + { + "epoch": 2.4076460247417106, + "grad_norm": 8.998193740844727, + "learning_rate": 9.872566254304827e-06, + "loss": 2.6679, + "step": 7745000 + }, + { + "epoch": 2.407801457022197, + "grad_norm": 10.93878173828125, + "learning_rate": 9.869975716296712e-06, + "loss": 2.6278, + "step": 7745500 + }, + { + "epoch": 2.4079568893026844, + "grad_norm": 10.023466110229492, + "learning_rate": 9.867385178288598e-06, + "loss": 2.6838, + "step": 7746000 + }, + { + "epoch": 2.408112321583171, + "grad_norm": 9.893556594848633, + "learning_rate": 9.864794640280483e-06, + "loss": 2.6484, + "step": 7746500 + }, + { + "epoch": 2.408267753863658, + "grad_norm": 11.876932144165039, + "learning_rate": 9.862204102272369e-06, + "loss": 2.5998, + "step": 7747000 + }, + { + "epoch": 2.408423186144145, + "grad_norm": 11.24642562866211, + "learning_rate": 9.859613564264254e-06, + "loss": 2.6725, + "step": 7747500 + }, + { + "epoch": 2.408578618424632, + "grad_norm": 11.16543197631836, + "learning_rate": 9.85702302625614e-06, + "loss": 2.6413, + "step": 7748000 + }, + { + "epoch": 2.4087340507051187, + "grad_norm": 9.960685729980469, + "learning_rate": 9.854432488248025e-06, + "loss": 2.6562, + "step": 7748500 + }, + { + "epoch": 2.4088894829856056, + "grad_norm": 13.185468673706055, + "learning_rate": 9.851841950239909e-06, + "loss": 2.6508, + "step": 7749000 + }, + { + "epoch": 2.4090449152660924, + "grad_norm": 14.727001190185547, + "learning_rate": 9.849251412231796e-06, + "loss": 2.6491, + "step": 7749500 + }, + { + "epoch": 2.4092003475465793, + "grad_norm": 9.500683784484863, + "learning_rate": 9.846660874223681e-06, + "loss": 2.6117, + "step": 7750000 + }, + { + "epoch": 2.409355779827066, + "grad_norm": 9.4154052734375, + "learning_rate": 9.844070336215567e-06, + "loss": 2.6401, + "step": 7750500 + }, + { + "epoch": 2.409511212107553, + "grad_norm": 8.830779075622559, + "learning_rate": 9.841479798207452e-06, + "loss": 2.6538, + "step": 7751000 + }, + { + "epoch": 2.40966664438804, + "grad_norm": 16.521114349365234, + "learning_rate": 9.838889260199336e-06, + "loss": 2.6231, + "step": 7751500 + }, + { + "epoch": 2.4098220766685268, + "grad_norm": 7.9215216636657715, + "learning_rate": 9.836298722191223e-06, + "loss": 2.6431, + "step": 7752000 + }, + { + "epoch": 2.4099775089490136, + "grad_norm": 10.024120330810547, + "learning_rate": 9.833708184183107e-06, + "loss": 2.6617, + "step": 7752500 + }, + { + "epoch": 2.4101329412295005, + "grad_norm": 10.063916206359863, + "learning_rate": 9.831117646174994e-06, + "loss": 2.6289, + "step": 7753000 + }, + { + "epoch": 2.4102883735099874, + "grad_norm": 9.932272911071777, + "learning_rate": 9.828527108166878e-06, + "loss": 2.6038, + "step": 7753500 + }, + { + "epoch": 2.4104438057904742, + "grad_norm": 13.824145317077637, + "learning_rate": 9.825936570158763e-06, + "loss": 2.6725, + "step": 7754000 + }, + { + "epoch": 2.410599238070961, + "grad_norm": 7.681090831756592, + "learning_rate": 9.82334603215065e-06, + "loss": 2.6291, + "step": 7754500 + }, + { + "epoch": 2.410754670351448, + "grad_norm": 10.090906143188477, + "learning_rate": 9.820755494142534e-06, + "loss": 2.601, + "step": 7755000 + }, + { + "epoch": 2.410910102631935, + "grad_norm": 11.261488914489746, + "learning_rate": 9.818164956134421e-06, + "loss": 2.6826, + "step": 7755500 + }, + { + "epoch": 2.4110655349124217, + "grad_norm": 16.539779663085938, + "learning_rate": 9.815574418126305e-06, + "loss": 2.638, + "step": 7756000 + }, + { + "epoch": 2.4112209671929086, + "grad_norm": 10.87126636505127, + "learning_rate": 9.81298388011819e-06, + "loss": 2.6221, + "step": 7756500 + }, + { + "epoch": 2.4113763994733954, + "grad_norm": 9.57846736907959, + "learning_rate": 9.810393342110076e-06, + "loss": 2.5768, + "step": 7757000 + }, + { + "epoch": 2.4115318317538823, + "grad_norm": 6.346871852874756, + "learning_rate": 9.807802804101962e-06, + "loss": 2.6509, + "step": 7757500 + }, + { + "epoch": 2.411687264034369, + "grad_norm": 9.020130157470703, + "learning_rate": 9.805212266093847e-06, + "loss": 2.6188, + "step": 7758000 + }, + { + "epoch": 2.411842696314856, + "grad_norm": 9.355635643005371, + "learning_rate": 9.802621728085732e-06, + "loss": 2.6197, + "step": 7758500 + }, + { + "epoch": 2.411998128595343, + "grad_norm": 9.621983528137207, + "learning_rate": 9.800031190077618e-06, + "loss": 2.5846, + "step": 7759000 + }, + { + "epoch": 2.4121535608758298, + "grad_norm": 10.459296226501465, + "learning_rate": 9.797440652069503e-06, + "loss": 2.6401, + "step": 7759500 + }, + { + "epoch": 2.4123089931563166, + "grad_norm": 9.285515785217285, + "learning_rate": 9.794850114061389e-06, + "loss": 2.6133, + "step": 7760000 + }, + { + "epoch": 2.4124644254368035, + "grad_norm": 12.781890869140625, + "learning_rate": 9.792259576053274e-06, + "loss": 2.6322, + "step": 7760500 + }, + { + "epoch": 2.4126198577172904, + "grad_norm": 7.283337593078613, + "learning_rate": 9.78966903804516e-06, + "loss": 2.6166, + "step": 7761000 + }, + { + "epoch": 2.4127752899977772, + "grad_norm": 9.34042739868164, + "learning_rate": 9.787078500037045e-06, + "loss": 2.6196, + "step": 7761500 + }, + { + "epoch": 2.412930722278264, + "grad_norm": 9.987308502197266, + "learning_rate": 9.78448796202893e-06, + "loss": 2.588, + "step": 7762000 + }, + { + "epoch": 2.413086154558751, + "grad_norm": 9.129901885986328, + "learning_rate": 9.781897424020814e-06, + "loss": 2.6725, + "step": 7762500 + }, + { + "epoch": 2.413241586839238, + "grad_norm": 8.565194129943848, + "learning_rate": 9.779306886012702e-06, + "loss": 2.5924, + "step": 7763000 + }, + { + "epoch": 2.4133970191197247, + "grad_norm": 9.481795310974121, + "learning_rate": 9.776716348004587e-06, + "loss": 2.6437, + "step": 7763500 + }, + { + "epoch": 2.4135524514002116, + "grad_norm": 8.39042854309082, + "learning_rate": 9.774125809996472e-06, + "loss": 2.5982, + "step": 7764000 + }, + { + "epoch": 2.4137078836806984, + "grad_norm": 11.635832786560059, + "learning_rate": 9.771535271988358e-06, + "loss": 2.6125, + "step": 7764500 + }, + { + "epoch": 2.4138633159611853, + "grad_norm": 14.81212043762207, + "learning_rate": 9.768944733980242e-06, + "loss": 2.6225, + "step": 7765000 + }, + { + "epoch": 2.414018748241672, + "grad_norm": 32.08842849731445, + "learning_rate": 9.766354195972129e-06, + "loss": 2.6292, + "step": 7765500 + }, + { + "epoch": 2.414174180522159, + "grad_norm": 9.374916076660156, + "learning_rate": 9.763763657964013e-06, + "loss": 2.6402, + "step": 7766000 + }, + { + "epoch": 2.414329612802646, + "grad_norm": 12.64730453491211, + "learning_rate": 9.7611731199559e-06, + "loss": 2.5891, + "step": 7766500 + }, + { + "epoch": 2.4144850450831328, + "grad_norm": 9.58344554901123, + "learning_rate": 9.758582581947785e-06, + "loss": 2.6035, + "step": 7767000 + }, + { + "epoch": 2.4146404773636196, + "grad_norm": 9.26862907409668, + "learning_rate": 9.755992043939669e-06, + "loss": 2.6562, + "step": 7767500 + }, + { + "epoch": 2.4147959096441065, + "grad_norm": 7.890481472015381, + "learning_rate": 9.753401505931556e-06, + "loss": 2.6166, + "step": 7768000 + }, + { + "epoch": 2.414951341924594, + "grad_norm": 10.50665283203125, + "learning_rate": 9.75081096792344e-06, + "loss": 2.5922, + "step": 7768500 + }, + { + "epoch": 2.4151067742050802, + "grad_norm": 22.702552795410156, + "learning_rate": 9.748220429915327e-06, + "loss": 2.6052, + "step": 7769000 + }, + { + "epoch": 2.4152622064855676, + "grad_norm": 8.961986541748047, + "learning_rate": 9.745629891907211e-06, + "loss": 2.6144, + "step": 7769500 + }, + { + "epoch": 2.415417638766054, + "grad_norm": 11.619855880737305, + "learning_rate": 9.743039353899098e-06, + "loss": 2.596, + "step": 7770000 + }, + { + "epoch": 2.4155730710465413, + "grad_norm": 10.253137588500977, + "learning_rate": 9.740448815890982e-06, + "loss": 2.6134, + "step": 7770500 + }, + { + "epoch": 2.415728503327028, + "grad_norm": 9.412673950195312, + "learning_rate": 9.737858277882867e-06, + "loss": 2.5592, + "step": 7771000 + }, + { + "epoch": 2.415883935607515, + "grad_norm": 8.860984802246094, + "learning_rate": 9.735267739874754e-06, + "loss": 2.6107, + "step": 7771500 + }, + { + "epoch": 2.416039367888002, + "grad_norm": 9.331381797790527, + "learning_rate": 9.732677201866638e-06, + "loss": 2.5787, + "step": 7772000 + }, + { + "epoch": 2.4161948001684888, + "grad_norm": 8.695039749145508, + "learning_rate": 9.730086663858525e-06, + "loss": 2.6206, + "step": 7772500 + }, + { + "epoch": 2.4163502324489756, + "grad_norm": 11.041115760803223, + "learning_rate": 9.727496125850409e-06, + "loss": 2.625, + "step": 7773000 + }, + { + "epoch": 2.4165056647294625, + "grad_norm": 10.252971649169922, + "learning_rate": 9.724905587842295e-06, + "loss": 2.6344, + "step": 7773500 + }, + { + "epoch": 2.4166610970099494, + "grad_norm": 10.145711898803711, + "learning_rate": 9.72231504983418e-06, + "loss": 2.6064, + "step": 7774000 + }, + { + "epoch": 2.416816529290436, + "grad_norm": 15.883262634277344, + "learning_rate": 9.719724511826065e-06, + "loss": 2.6304, + "step": 7774500 + }, + { + "epoch": 2.416971961570923, + "grad_norm": 11.06379222869873, + "learning_rate": 9.717133973817951e-06, + "loss": 2.6041, + "step": 7775000 + }, + { + "epoch": 2.41712739385141, + "grad_norm": 11.240561485290527, + "learning_rate": 9.714543435809836e-06, + "loss": 2.6246, + "step": 7775500 + }, + { + "epoch": 2.417282826131897, + "grad_norm": 16.952421188354492, + "learning_rate": 9.711952897801722e-06, + "loss": 2.6087, + "step": 7776000 + }, + { + "epoch": 2.4174382584123837, + "grad_norm": 11.228963851928711, + "learning_rate": 9.709362359793607e-06, + "loss": 2.6219, + "step": 7776500 + }, + { + "epoch": 2.4175936906928706, + "grad_norm": 9.862689018249512, + "learning_rate": 9.706771821785493e-06, + "loss": 2.6246, + "step": 7777000 + }, + { + "epoch": 2.4177491229733574, + "grad_norm": 13.303367614746094, + "learning_rate": 9.704181283777378e-06, + "loss": 2.6146, + "step": 7777500 + }, + { + "epoch": 2.4179045552538443, + "grad_norm": 8.774214744567871, + "learning_rate": 9.701590745769264e-06, + "loss": 2.6327, + "step": 7778000 + }, + { + "epoch": 2.418059987534331, + "grad_norm": 12.621236801147461, + "learning_rate": 9.699000207761149e-06, + "loss": 2.5748, + "step": 7778500 + }, + { + "epoch": 2.418215419814818, + "grad_norm": 9.042654991149902, + "learning_rate": 9.696409669753035e-06, + "loss": 2.5788, + "step": 7779000 + }, + { + "epoch": 2.418370852095305, + "grad_norm": 9.161460876464844, + "learning_rate": 9.693819131744918e-06, + "loss": 2.6072, + "step": 7779500 + }, + { + "epoch": 2.4185262843757918, + "grad_norm": 25.902917861938477, + "learning_rate": 9.691228593736806e-06, + "loss": 2.5825, + "step": 7780000 + }, + { + "epoch": 2.4186817166562786, + "grad_norm": 11.510024070739746, + "learning_rate": 9.688638055728691e-06, + "loss": 2.5725, + "step": 7780500 + }, + { + "epoch": 2.4188371489367655, + "grad_norm": 10.28022289276123, + "learning_rate": 9.686047517720576e-06, + "loss": 2.6028, + "step": 7781000 + }, + { + "epoch": 2.4189925812172524, + "grad_norm": 10.049308776855469, + "learning_rate": 9.683456979712462e-06, + "loss": 2.6025, + "step": 7781500 + }, + { + "epoch": 2.4191480134977392, + "grad_norm": 10.78012752532959, + "learning_rate": 9.680866441704346e-06, + "loss": 2.6168, + "step": 7782000 + }, + { + "epoch": 2.419303445778226, + "grad_norm": 8.846877098083496, + "learning_rate": 9.678275903696233e-06, + "loss": 2.6164, + "step": 7782500 + }, + { + "epoch": 2.419458878058713, + "grad_norm": 10.421712875366211, + "learning_rate": 9.675685365688117e-06, + "loss": 2.6266, + "step": 7783000 + }, + { + "epoch": 2.4196143103392, + "grad_norm": 14.925328254699707, + "learning_rate": 9.673094827680004e-06, + "loss": 2.6363, + "step": 7783500 + }, + { + "epoch": 2.4197697426196867, + "grad_norm": 9.616791725158691, + "learning_rate": 9.670504289671887e-06, + "loss": 2.6351, + "step": 7784000 + }, + { + "epoch": 2.4199251749001736, + "grad_norm": 10.231961250305176, + "learning_rate": 9.667913751663773e-06, + "loss": 2.5964, + "step": 7784500 + }, + { + "epoch": 2.4200806071806604, + "grad_norm": 8.975540161132812, + "learning_rate": 9.66532321365566e-06, + "loss": 2.6519, + "step": 7785000 + }, + { + "epoch": 2.4202360394611473, + "grad_norm": 9.67665958404541, + "learning_rate": 9.662732675647544e-06, + "loss": 2.5983, + "step": 7785500 + }, + { + "epoch": 2.420391471741634, + "grad_norm": 8.582395553588867, + "learning_rate": 9.660142137639431e-06, + "loss": 2.5807, + "step": 7786000 + }, + { + "epoch": 2.420546904022121, + "grad_norm": 16.015226364135742, + "learning_rate": 9.657551599631315e-06, + "loss": 2.6116, + "step": 7786500 + }, + { + "epoch": 2.420702336302608, + "grad_norm": 14.272035598754883, + "learning_rate": 9.6549610616232e-06, + "loss": 2.6394, + "step": 7787000 + }, + { + "epoch": 2.4208577685830948, + "grad_norm": 8.854761123657227, + "learning_rate": 9.652370523615086e-06, + "loss": 2.6128, + "step": 7787500 + }, + { + "epoch": 2.4210132008635816, + "grad_norm": 9.035273551940918, + "learning_rate": 9.649779985606971e-06, + "loss": 2.6839, + "step": 7788000 + }, + { + "epoch": 2.4211686331440685, + "grad_norm": 10.62313461303711, + "learning_rate": 9.647189447598857e-06, + "loss": 2.6377, + "step": 7788500 + }, + { + "epoch": 2.4213240654245554, + "grad_norm": 12.246312141418457, + "learning_rate": 9.644598909590742e-06, + "loss": 2.6543, + "step": 7789000 + }, + { + "epoch": 2.4214794977050422, + "grad_norm": 8.072958946228027, + "learning_rate": 9.642008371582628e-06, + "loss": 2.6067, + "step": 7789500 + }, + { + "epoch": 2.421634929985529, + "grad_norm": 33.525142669677734, + "learning_rate": 9.639417833574513e-06, + "loss": 2.6382, + "step": 7790000 + }, + { + "epoch": 2.421790362266016, + "grad_norm": 8.84950065612793, + "learning_rate": 9.636827295566398e-06, + "loss": 2.617, + "step": 7790500 + }, + { + "epoch": 2.421945794546503, + "grad_norm": 16.425949096679688, + "learning_rate": 9.634236757558284e-06, + "loss": 2.6214, + "step": 7791000 + }, + { + "epoch": 2.4221012268269897, + "grad_norm": 11.14875602722168, + "learning_rate": 9.63164621955017e-06, + "loss": 2.5938, + "step": 7791500 + }, + { + "epoch": 2.422256659107477, + "grad_norm": 9.842571258544922, + "learning_rate": 9.629055681542055e-06, + "loss": 2.6245, + "step": 7792000 + }, + { + "epoch": 2.4224120913879634, + "grad_norm": 11.605928421020508, + "learning_rate": 9.62646514353394e-06, + "loss": 2.5592, + "step": 7792500 + }, + { + "epoch": 2.4225675236684507, + "grad_norm": 9.869601249694824, + "learning_rate": 9.623874605525824e-06, + "loss": 2.6113, + "step": 7793000 + }, + { + "epoch": 2.422722955948937, + "grad_norm": 13.588902473449707, + "learning_rate": 9.621284067517711e-06, + "loss": 2.6267, + "step": 7793500 + }, + { + "epoch": 2.4228783882294245, + "grad_norm": 12.88344955444336, + "learning_rate": 9.618693529509597e-06, + "loss": 2.5991, + "step": 7794000 + }, + { + "epoch": 2.423033820509911, + "grad_norm": 8.861879348754883, + "learning_rate": 9.616102991501482e-06, + "loss": 2.6051, + "step": 7794500 + }, + { + "epoch": 2.423189252790398, + "grad_norm": 13.121355056762695, + "learning_rate": 9.613512453493368e-06, + "loss": 2.5923, + "step": 7795000 + }, + { + "epoch": 2.423344685070885, + "grad_norm": 11.028985977172852, + "learning_rate": 9.610921915485251e-06, + "loss": 2.6126, + "step": 7795500 + }, + { + "epoch": 2.423500117351372, + "grad_norm": 10.040597915649414, + "learning_rate": 9.608331377477139e-06, + "loss": 2.5814, + "step": 7796000 + }, + { + "epoch": 2.423655549631859, + "grad_norm": 8.782320976257324, + "learning_rate": 9.605740839469022e-06, + "loss": 2.6473, + "step": 7796500 + }, + { + "epoch": 2.4238109819123457, + "grad_norm": 12.70631217956543, + "learning_rate": 9.60315030146091e-06, + "loss": 2.6563, + "step": 7797000 + }, + { + "epoch": 2.4239664141928325, + "grad_norm": 9.41157341003418, + "learning_rate": 9.600559763452793e-06, + "loss": 2.5813, + "step": 7797500 + }, + { + "epoch": 2.4241218464733194, + "grad_norm": 8.494071006774902, + "learning_rate": 9.597969225444679e-06, + "loss": 2.617, + "step": 7798000 + }, + { + "epoch": 2.4242772787538063, + "grad_norm": 11.844693183898926, + "learning_rate": 9.595378687436566e-06, + "loss": 2.5894, + "step": 7798500 + }, + { + "epoch": 2.424432711034293, + "grad_norm": 10.172286987304688, + "learning_rate": 9.59278814942845e-06, + "loss": 2.6261, + "step": 7799000 + }, + { + "epoch": 2.42458814331478, + "grad_norm": 9.415351867675781, + "learning_rate": 9.590197611420337e-06, + "loss": 2.6032, + "step": 7799500 + }, + { + "epoch": 2.424743575595267, + "grad_norm": 11.542864799499512, + "learning_rate": 9.58760707341222e-06, + "loss": 2.5928, + "step": 7800000 + }, + { + "epoch": 2.4248990078757537, + "grad_norm": 8.562819480895996, + "learning_rate": 9.585016535404106e-06, + "loss": 2.6496, + "step": 7800500 + }, + { + "epoch": 2.4250544401562406, + "grad_norm": 58.517032623291016, + "learning_rate": 9.582425997395991e-06, + "loss": 2.6013, + "step": 7801000 + }, + { + "epoch": 2.4252098724367275, + "grad_norm": 9.052206993103027, + "learning_rate": 9.579835459387877e-06, + "loss": 2.5559, + "step": 7801500 + }, + { + "epoch": 2.4253653047172143, + "grad_norm": 10.1763277053833, + "learning_rate": 9.577244921379762e-06, + "loss": 2.6315, + "step": 7802000 + }, + { + "epoch": 2.425520736997701, + "grad_norm": 10.24562931060791, + "learning_rate": 9.574654383371648e-06, + "loss": 2.5858, + "step": 7802500 + }, + { + "epoch": 2.425676169278188, + "grad_norm": 8.083093643188477, + "learning_rate": 9.572063845363533e-06, + "loss": 2.6534, + "step": 7803000 + }, + { + "epoch": 2.425831601558675, + "grad_norm": 13.010343551635742, + "learning_rate": 9.569473307355419e-06, + "loss": 2.6375, + "step": 7803500 + }, + { + "epoch": 2.425987033839162, + "grad_norm": 8.912192344665527, + "learning_rate": 9.566882769347304e-06, + "loss": 2.623, + "step": 7804000 + }, + { + "epoch": 2.4261424661196487, + "grad_norm": 14.37868881225586, + "learning_rate": 9.56429223133919e-06, + "loss": 2.6381, + "step": 7804500 + }, + { + "epoch": 2.4262978984001355, + "grad_norm": 8.617725372314453, + "learning_rate": 9.561701693331075e-06, + "loss": 2.6125, + "step": 7805000 + }, + { + "epoch": 2.4264533306806224, + "grad_norm": 13.942831993103027, + "learning_rate": 9.55911115532296e-06, + "loss": 2.6338, + "step": 7805500 + }, + { + "epoch": 2.4266087629611093, + "grad_norm": 12.1165771484375, + "learning_rate": 9.556520617314846e-06, + "loss": 2.6226, + "step": 7806000 + }, + { + "epoch": 2.426764195241596, + "grad_norm": 10.511059761047363, + "learning_rate": 9.55393007930673e-06, + "loss": 2.6452, + "step": 7806500 + }, + { + "epoch": 2.426919627522083, + "grad_norm": 19.403501510620117, + "learning_rate": 9.551339541298617e-06, + "loss": 2.6193, + "step": 7807000 + }, + { + "epoch": 2.42707505980257, + "grad_norm": 22.146785736083984, + "learning_rate": 9.548749003290502e-06, + "loss": 2.6062, + "step": 7807500 + }, + { + "epoch": 2.4272304920830567, + "grad_norm": 7.8073859214782715, + "learning_rate": 9.546158465282388e-06, + "loss": 2.6465, + "step": 7808000 + }, + { + "epoch": 2.4273859243635436, + "grad_norm": 12.364564895629883, + "learning_rate": 9.543567927274273e-06, + "loss": 2.5907, + "step": 7808500 + }, + { + "epoch": 2.4275413566440305, + "grad_norm": 15.588170051574707, + "learning_rate": 9.540977389266157e-06, + "loss": 2.6181, + "step": 7809000 + }, + { + "epoch": 2.4276967889245173, + "grad_norm": 11.145054817199707, + "learning_rate": 9.538386851258044e-06, + "loss": 2.6676, + "step": 7809500 + }, + { + "epoch": 2.427852221205004, + "grad_norm": 8.958126068115234, + "learning_rate": 9.535796313249928e-06, + "loss": 2.5868, + "step": 7810000 + }, + { + "epoch": 2.428007653485491, + "grad_norm": 13.360427856445312, + "learning_rate": 9.533205775241815e-06, + "loss": 2.6334, + "step": 7810500 + }, + { + "epoch": 2.428163085765978, + "grad_norm": 11.605876922607422, + "learning_rate": 9.530615237233699e-06, + "loss": 2.6342, + "step": 7811000 + }, + { + "epoch": 2.428318518046465, + "grad_norm": 7.6719651222229, + "learning_rate": 9.528024699225584e-06, + "loss": 2.6284, + "step": 7811500 + }, + { + "epoch": 2.4284739503269517, + "grad_norm": 14.874713897705078, + "learning_rate": 9.525434161217472e-06, + "loss": 2.6061, + "step": 7812000 + }, + { + "epoch": 2.4286293826074385, + "grad_norm": 13.432047843933105, + "learning_rate": 9.522843623209355e-06, + "loss": 2.6856, + "step": 7812500 + }, + { + "epoch": 2.4287848148879254, + "grad_norm": 10.492753028869629, + "learning_rate": 9.520253085201242e-06, + "loss": 2.6381, + "step": 7813000 + }, + { + "epoch": 2.4289402471684123, + "grad_norm": 8.865194320678711, + "learning_rate": 9.517662547193126e-06, + "loss": 2.6628, + "step": 7813500 + }, + { + "epoch": 2.429095679448899, + "grad_norm": 8.679743766784668, + "learning_rate": 9.515072009185012e-06, + "loss": 2.6366, + "step": 7814000 + }, + { + "epoch": 2.429251111729386, + "grad_norm": 9.652034759521484, + "learning_rate": 9.512481471176897e-06, + "loss": 2.6143, + "step": 7814500 + }, + { + "epoch": 2.429406544009873, + "grad_norm": 12.2560396194458, + "learning_rate": 9.509890933168783e-06, + "loss": 2.6438, + "step": 7815000 + }, + { + "epoch": 2.4295619762903597, + "grad_norm": 10.267603874206543, + "learning_rate": 9.507300395160668e-06, + "loss": 2.5978, + "step": 7815500 + }, + { + "epoch": 2.4297174085708466, + "grad_norm": 10.147416114807129, + "learning_rate": 9.504709857152553e-06, + "loss": 2.573, + "step": 7816000 + }, + { + "epoch": 2.429872840851334, + "grad_norm": 21.320363998413086, + "learning_rate": 9.502119319144439e-06, + "loss": 2.6224, + "step": 7816500 + }, + { + "epoch": 2.4300282731318203, + "grad_norm": 9.200718879699707, + "learning_rate": 9.499528781136324e-06, + "loss": 2.6429, + "step": 7817000 + }, + { + "epoch": 2.4301837054123077, + "grad_norm": 6.729416370391846, + "learning_rate": 9.49693824312821e-06, + "loss": 2.5955, + "step": 7817500 + }, + { + "epoch": 2.430339137692794, + "grad_norm": 11.424772262573242, + "learning_rate": 9.494347705120095e-06, + "loss": 2.6261, + "step": 7818000 + }, + { + "epoch": 2.4304945699732814, + "grad_norm": 16.713634490966797, + "learning_rate": 9.49175716711198e-06, + "loss": 2.6564, + "step": 7818500 + }, + { + "epoch": 2.4306500022537683, + "grad_norm": 9.353818893432617, + "learning_rate": 9.489166629103866e-06, + "loss": 2.5931, + "step": 7819000 + }, + { + "epoch": 2.430805434534255, + "grad_norm": 18.371843338012695, + "learning_rate": 9.486576091095752e-06, + "loss": 2.6382, + "step": 7819500 + }, + { + "epoch": 2.430960866814742, + "grad_norm": 10.394922256469727, + "learning_rate": 9.483985553087635e-06, + "loss": 2.6292, + "step": 7820000 + }, + { + "epoch": 2.431116299095229, + "grad_norm": 9.871240615844727, + "learning_rate": 9.481395015079523e-06, + "loss": 2.6702, + "step": 7820500 + }, + { + "epoch": 2.4312717313757157, + "grad_norm": 16.989524841308594, + "learning_rate": 9.478804477071408e-06, + "loss": 2.6387, + "step": 7821000 + }, + { + "epoch": 2.4314271636562026, + "grad_norm": 9.456696510314941, + "learning_rate": 9.476213939063294e-06, + "loss": 2.6263, + "step": 7821500 + }, + { + "epoch": 2.4315825959366895, + "grad_norm": 11.147746086120605, + "learning_rate": 9.473623401055179e-06, + "loss": 2.6387, + "step": 7822000 + }, + { + "epoch": 2.4317380282171763, + "grad_norm": 11.266227722167969, + "learning_rate": 9.471032863047063e-06, + "loss": 2.6738, + "step": 7822500 + }, + { + "epoch": 2.431893460497663, + "grad_norm": 18.942392349243164, + "learning_rate": 9.46844232503895e-06, + "loss": 2.5966, + "step": 7823000 + }, + { + "epoch": 2.43204889277815, + "grad_norm": 9.790071487426758, + "learning_rate": 9.465851787030834e-06, + "loss": 2.6098, + "step": 7823500 + }, + { + "epoch": 2.432204325058637, + "grad_norm": 7.634706020355225, + "learning_rate": 9.46326124902272e-06, + "loss": 2.6524, + "step": 7824000 + }, + { + "epoch": 2.432359757339124, + "grad_norm": 10.677486419677734, + "learning_rate": 9.460670711014605e-06, + "loss": 2.5855, + "step": 7824500 + }, + { + "epoch": 2.4325151896196107, + "grad_norm": 8.633810997009277, + "learning_rate": 9.45808017300649e-06, + "loss": 2.6318, + "step": 7825000 + }, + { + "epoch": 2.4326706219000975, + "grad_norm": 9.518287658691406, + "learning_rate": 9.455489634998377e-06, + "loss": 2.5679, + "step": 7825500 + }, + { + "epoch": 2.4328260541805844, + "grad_norm": 11.298257827758789, + "learning_rate": 9.452899096990261e-06, + "loss": 2.6137, + "step": 7826000 + }, + { + "epoch": 2.4329814864610713, + "grad_norm": 12.30610466003418, + "learning_rate": 9.450308558982148e-06, + "loss": 2.6543, + "step": 7826500 + }, + { + "epoch": 2.433136918741558, + "grad_norm": 11.659531593322754, + "learning_rate": 9.447718020974032e-06, + "loss": 2.6037, + "step": 7827000 + }, + { + "epoch": 2.433292351022045, + "grad_norm": 8.92645263671875, + "learning_rate": 9.445127482965917e-06, + "loss": 2.6092, + "step": 7827500 + }, + { + "epoch": 2.433447783302532, + "grad_norm": 10.053767204284668, + "learning_rate": 9.442536944957803e-06, + "loss": 2.5907, + "step": 7828000 + }, + { + "epoch": 2.4336032155830187, + "grad_norm": 10.385760307312012, + "learning_rate": 9.439946406949688e-06, + "loss": 2.6166, + "step": 7828500 + }, + { + "epoch": 2.4337586478635056, + "grad_norm": 7.284598350524902, + "learning_rate": 9.437355868941574e-06, + "loss": 2.6316, + "step": 7829000 + }, + { + "epoch": 2.4339140801439925, + "grad_norm": 9.255459785461426, + "learning_rate": 9.43476533093346e-06, + "loss": 2.6724, + "step": 7829500 + }, + { + "epoch": 2.4340695124244793, + "grad_norm": 7.123172760009766, + "learning_rate": 9.432174792925345e-06, + "loss": 2.6304, + "step": 7830000 + }, + { + "epoch": 2.434224944704966, + "grad_norm": 10.449542045593262, + "learning_rate": 9.42958425491723e-06, + "loss": 2.6404, + "step": 7830500 + }, + { + "epoch": 2.434380376985453, + "grad_norm": 53.25800704956055, + "learning_rate": 9.426993716909116e-06, + "loss": 2.6247, + "step": 7831000 + }, + { + "epoch": 2.43453580926594, + "grad_norm": 9.66358757019043, + "learning_rate": 9.424403178901001e-06, + "loss": 2.5999, + "step": 7831500 + }, + { + "epoch": 2.434691241546427, + "grad_norm": 10.623598098754883, + "learning_rate": 9.421812640892886e-06, + "loss": 2.6105, + "step": 7832000 + }, + { + "epoch": 2.4348466738269137, + "grad_norm": 9.195442199707031, + "learning_rate": 9.419222102884772e-06, + "loss": 2.5774, + "step": 7832500 + }, + { + "epoch": 2.4350021061074005, + "grad_norm": 10.409482955932617, + "learning_rate": 9.416631564876657e-06, + "loss": 2.5854, + "step": 7833000 + }, + { + "epoch": 2.4351575383878874, + "grad_norm": 10.247270584106445, + "learning_rate": 9.414041026868541e-06, + "loss": 2.6352, + "step": 7833500 + }, + { + "epoch": 2.4353129706683743, + "grad_norm": 9.972375869750977, + "learning_rate": 9.411450488860428e-06, + "loss": 2.6199, + "step": 7834000 + }, + { + "epoch": 2.435468402948861, + "grad_norm": 8.422740936279297, + "learning_rate": 9.408859950852314e-06, + "loss": 2.5629, + "step": 7834500 + }, + { + "epoch": 2.435623835229348, + "grad_norm": 6.669712066650391, + "learning_rate": 9.4062694128442e-06, + "loss": 2.6175, + "step": 7835000 + }, + { + "epoch": 2.435779267509835, + "grad_norm": 9.422739028930664, + "learning_rate": 9.403678874836085e-06, + "loss": 2.6251, + "step": 7835500 + }, + { + "epoch": 2.4359346997903217, + "grad_norm": 11.435242652893066, + "learning_rate": 9.401088336827968e-06, + "loss": 2.5538, + "step": 7836000 + }, + { + "epoch": 2.4360901320708086, + "grad_norm": 10.030596733093262, + "learning_rate": 9.398497798819856e-06, + "loss": 2.6149, + "step": 7836500 + }, + { + "epoch": 2.4362455643512955, + "grad_norm": 9.193031311035156, + "learning_rate": 9.39590726081174e-06, + "loss": 2.6247, + "step": 7837000 + }, + { + "epoch": 2.4364009966317823, + "grad_norm": 10.478922843933105, + "learning_rate": 9.393316722803627e-06, + "loss": 2.6341, + "step": 7837500 + }, + { + "epoch": 2.436556428912269, + "grad_norm": 9.4521484375, + "learning_rate": 9.39072618479551e-06, + "loss": 2.5672, + "step": 7838000 + }, + { + "epoch": 2.436711861192756, + "grad_norm": 9.103076934814453, + "learning_rate": 9.388135646787396e-06, + "loss": 2.6433, + "step": 7838500 + }, + { + "epoch": 2.436867293473243, + "grad_norm": 13.923121452331543, + "learning_rate": 9.385545108779283e-06, + "loss": 2.5973, + "step": 7839000 + }, + { + "epoch": 2.43702272575373, + "grad_norm": 11.183753967285156, + "learning_rate": 9.382954570771167e-06, + "loss": 2.5953, + "step": 7839500 + }, + { + "epoch": 2.4371781580342167, + "grad_norm": 10.071633338928223, + "learning_rate": 9.380364032763054e-06, + "loss": 2.5931, + "step": 7840000 + }, + { + "epoch": 2.4373335903147035, + "grad_norm": 10.077082633972168, + "learning_rate": 9.377773494754938e-06, + "loss": 2.6202, + "step": 7840500 + }, + { + "epoch": 2.437489022595191, + "grad_norm": 9.046985626220703, + "learning_rate": 9.375182956746823e-06, + "loss": 2.6157, + "step": 7841000 + }, + { + "epoch": 2.4376444548756773, + "grad_norm": 13.207377433776855, + "learning_rate": 9.372592418738709e-06, + "loss": 2.618, + "step": 7841500 + }, + { + "epoch": 2.4377998871561646, + "grad_norm": 8.819815635681152, + "learning_rate": 9.370001880730594e-06, + "loss": 2.6081, + "step": 7842000 + }, + { + "epoch": 2.437955319436651, + "grad_norm": 9.488791465759277, + "learning_rate": 9.36741134272248e-06, + "loss": 2.6337, + "step": 7842500 + }, + { + "epoch": 2.4381107517171383, + "grad_norm": 42.578006744384766, + "learning_rate": 9.364820804714365e-06, + "loss": 2.6589, + "step": 7843000 + }, + { + "epoch": 2.438266183997625, + "grad_norm": 10.657855033874512, + "learning_rate": 9.36223026670625e-06, + "loss": 2.6326, + "step": 7843500 + }, + { + "epoch": 2.438421616278112, + "grad_norm": 9.534990310668945, + "learning_rate": 9.359639728698136e-06, + "loss": 2.5865, + "step": 7844000 + }, + { + "epoch": 2.438577048558599, + "grad_norm": 8.327042579650879, + "learning_rate": 9.357049190690021e-06, + "loss": 2.6201, + "step": 7844500 + }, + { + "epoch": 2.438732480839086, + "grad_norm": 8.643030166625977, + "learning_rate": 9.354458652681907e-06, + "loss": 2.6164, + "step": 7845000 + }, + { + "epoch": 2.4388879131195726, + "grad_norm": 8.174234390258789, + "learning_rate": 9.351868114673792e-06, + "loss": 2.6175, + "step": 7845500 + }, + { + "epoch": 2.4390433454000595, + "grad_norm": 10.571571350097656, + "learning_rate": 9.349277576665678e-06, + "loss": 2.6098, + "step": 7846000 + }, + { + "epoch": 2.4391987776805464, + "grad_norm": 8.903958320617676, + "learning_rate": 9.346687038657563e-06, + "loss": 2.6362, + "step": 7846500 + }, + { + "epoch": 2.4393542099610332, + "grad_norm": 13.617655754089355, + "learning_rate": 9.344096500649447e-06, + "loss": 2.6222, + "step": 7847000 + }, + { + "epoch": 2.43950964224152, + "grad_norm": 9.112221717834473, + "learning_rate": 9.341505962641334e-06, + "loss": 2.6508, + "step": 7847500 + }, + { + "epoch": 2.439665074522007, + "grad_norm": 8.636855125427246, + "learning_rate": 9.33891542463322e-06, + "loss": 2.6591, + "step": 7848000 + }, + { + "epoch": 2.439820506802494, + "grad_norm": 9.893682479858398, + "learning_rate": 9.336324886625105e-06, + "loss": 2.5995, + "step": 7848500 + }, + { + "epoch": 2.4399759390829807, + "grad_norm": 10.241997718811035, + "learning_rate": 9.33373434861699e-06, + "loss": 2.601, + "step": 7849000 + }, + { + "epoch": 2.4401313713634676, + "grad_norm": 9.436676979064941, + "learning_rate": 9.331143810608874e-06, + "loss": 2.6139, + "step": 7849500 + }, + { + "epoch": 2.4402868036439544, + "grad_norm": 12.321553230285645, + "learning_rate": 9.328553272600761e-06, + "loss": 2.6674, + "step": 7850000 + }, + { + "epoch": 2.4404422359244413, + "grad_norm": 8.259700775146484, + "learning_rate": 9.325962734592645e-06, + "loss": 2.6172, + "step": 7850500 + }, + { + "epoch": 2.440597668204928, + "grad_norm": 11.746125221252441, + "learning_rate": 9.323372196584532e-06, + "loss": 2.6189, + "step": 7851000 + }, + { + "epoch": 2.440753100485415, + "grad_norm": 8.682841300964355, + "learning_rate": 9.320781658576416e-06, + "loss": 2.5781, + "step": 7851500 + }, + { + "epoch": 2.440908532765902, + "grad_norm": 8.79234790802002, + "learning_rate": 9.318191120568301e-06, + "loss": 2.6481, + "step": 7852000 + }, + { + "epoch": 2.441063965046389, + "grad_norm": 13.573744773864746, + "learning_rate": 9.315600582560189e-06, + "loss": 2.6234, + "step": 7852500 + }, + { + "epoch": 2.4412193973268757, + "grad_norm": 9.6123685836792, + "learning_rate": 9.313010044552072e-06, + "loss": 2.6079, + "step": 7853000 + }, + { + "epoch": 2.4413748296073625, + "grad_norm": 8.38227367401123, + "learning_rate": 9.31041950654396e-06, + "loss": 2.5964, + "step": 7853500 + }, + { + "epoch": 2.4415302618878494, + "grad_norm": 7.869183540344238, + "learning_rate": 9.307828968535843e-06, + "loss": 2.6614, + "step": 7854000 + }, + { + "epoch": 2.4416856941683363, + "grad_norm": 20.919212341308594, + "learning_rate": 9.305238430527729e-06, + "loss": 2.6065, + "step": 7854500 + }, + { + "epoch": 2.441841126448823, + "grad_norm": 9.016731262207031, + "learning_rate": 9.302647892519614e-06, + "loss": 2.6416, + "step": 7855000 + }, + { + "epoch": 2.44199655872931, + "grad_norm": 22.21299934387207, + "learning_rate": 9.3000573545115e-06, + "loss": 2.6158, + "step": 7855500 + }, + { + "epoch": 2.442151991009797, + "grad_norm": 11.243205070495605, + "learning_rate": 9.297466816503385e-06, + "loss": 2.6127, + "step": 7856000 + }, + { + "epoch": 2.4423074232902837, + "grad_norm": 8.581342697143555, + "learning_rate": 9.29487627849527e-06, + "loss": 2.6027, + "step": 7856500 + }, + { + "epoch": 2.4424628555707706, + "grad_norm": 11.249610900878906, + "learning_rate": 9.292285740487156e-06, + "loss": 2.5896, + "step": 7857000 + }, + { + "epoch": 2.4426182878512575, + "grad_norm": 23.552946090698242, + "learning_rate": 9.289695202479042e-06, + "loss": 2.6089, + "step": 7857500 + }, + { + "epoch": 2.4427737201317443, + "grad_norm": 9.602962493896484, + "learning_rate": 9.287104664470927e-06, + "loss": 2.5954, + "step": 7858000 + }, + { + "epoch": 2.442929152412231, + "grad_norm": 16.7407283782959, + "learning_rate": 9.284514126462812e-06, + "loss": 2.643, + "step": 7858500 + }, + { + "epoch": 2.443084584692718, + "grad_norm": 9.244669914245605, + "learning_rate": 9.281923588454698e-06, + "loss": 2.6296, + "step": 7859000 + }, + { + "epoch": 2.443240016973205, + "grad_norm": 9.814691543579102, + "learning_rate": 9.279333050446583e-06, + "loss": 2.5996, + "step": 7859500 + }, + { + "epoch": 2.443395449253692, + "grad_norm": 11.430777549743652, + "learning_rate": 9.276742512438469e-06, + "loss": 2.5935, + "step": 7860000 + }, + { + "epoch": 2.4435508815341787, + "grad_norm": 9.736214637756348, + "learning_rate": 9.274151974430354e-06, + "loss": 2.6117, + "step": 7860500 + }, + { + "epoch": 2.4437063138146655, + "grad_norm": 9.430752754211426, + "learning_rate": 9.27156143642224e-06, + "loss": 2.6398, + "step": 7861000 + }, + { + "epoch": 2.4438617460951524, + "grad_norm": 10.908650398254395, + "learning_rate": 9.268970898414125e-06, + "loss": 2.6032, + "step": 7861500 + }, + { + "epoch": 2.4440171783756393, + "grad_norm": 12.60810661315918, + "learning_rate": 9.26638036040601e-06, + "loss": 2.6088, + "step": 7862000 + }, + { + "epoch": 2.444172610656126, + "grad_norm": 11.183643341064453, + "learning_rate": 9.263789822397896e-06, + "loss": 2.6081, + "step": 7862500 + }, + { + "epoch": 2.444328042936613, + "grad_norm": 10.028496742248535, + "learning_rate": 9.261199284389782e-06, + "loss": 2.6, + "step": 7863000 + }, + { + "epoch": 2.4444834752171, + "grad_norm": 11.032176971435547, + "learning_rate": 9.258608746381667e-06, + "loss": 2.6781, + "step": 7863500 + }, + { + "epoch": 2.4446389074975867, + "grad_norm": 9.515079498291016, + "learning_rate": 9.25601820837355e-06, + "loss": 2.5983, + "step": 7864000 + }, + { + "epoch": 2.444794339778074, + "grad_norm": 13.398508071899414, + "learning_rate": 9.253427670365438e-06, + "loss": 2.5895, + "step": 7864500 + }, + { + "epoch": 2.4449497720585605, + "grad_norm": 10.155420303344727, + "learning_rate": 9.250837132357322e-06, + "loss": 2.5869, + "step": 7865000 + }, + { + "epoch": 2.4451052043390478, + "grad_norm": 11.634641647338867, + "learning_rate": 9.248246594349209e-06, + "loss": 2.5632, + "step": 7865500 + }, + { + "epoch": 2.445260636619534, + "grad_norm": 9.904585838317871, + "learning_rate": 9.245656056341094e-06, + "loss": 2.5717, + "step": 7866000 + }, + { + "epoch": 2.4454160689000215, + "grad_norm": 14.57614803314209, + "learning_rate": 9.243065518332978e-06, + "loss": 2.6437, + "step": 7866500 + }, + { + "epoch": 2.445571501180508, + "grad_norm": 13.189314842224121, + "learning_rate": 9.240474980324865e-06, + "loss": 2.6418, + "step": 7867000 + }, + { + "epoch": 2.4457269334609952, + "grad_norm": 18.355913162231445, + "learning_rate": 9.237884442316749e-06, + "loss": 2.6111, + "step": 7867500 + }, + { + "epoch": 2.445882365741482, + "grad_norm": 10.168913841247559, + "learning_rate": 9.235293904308636e-06, + "loss": 2.6125, + "step": 7868000 + }, + { + "epoch": 2.446037798021969, + "grad_norm": 11.410938262939453, + "learning_rate": 9.23270336630052e-06, + "loss": 2.6413, + "step": 7868500 + }, + { + "epoch": 2.446193230302456, + "grad_norm": 18.72329330444336, + "learning_rate": 9.230112828292405e-06, + "loss": 2.6185, + "step": 7869000 + }, + { + "epoch": 2.4463486625829427, + "grad_norm": 9.669730186462402, + "learning_rate": 9.227522290284291e-06, + "loss": 2.5855, + "step": 7869500 + }, + { + "epoch": 2.4465040948634296, + "grad_norm": 10.946808815002441, + "learning_rate": 9.224931752276176e-06, + "loss": 2.5939, + "step": 7870000 + }, + { + "epoch": 2.4466595271439164, + "grad_norm": 21.846296310424805, + "learning_rate": 9.222341214268063e-06, + "loss": 2.6187, + "step": 7870500 + }, + { + "epoch": 2.4468149594244033, + "grad_norm": 11.07868766784668, + "learning_rate": 9.219750676259947e-06, + "loss": 2.6682, + "step": 7871000 + }, + { + "epoch": 2.44697039170489, + "grad_norm": 10.97735595703125, + "learning_rate": 9.217160138251833e-06, + "loss": 2.5546, + "step": 7871500 + }, + { + "epoch": 2.447125823985377, + "grad_norm": 10.235910415649414, + "learning_rate": 9.214569600243718e-06, + "loss": 2.6042, + "step": 7872000 + }, + { + "epoch": 2.447281256265864, + "grad_norm": 11.126251220703125, + "learning_rate": 9.211979062235604e-06, + "loss": 2.6014, + "step": 7872500 + }, + { + "epoch": 2.4474366885463508, + "grad_norm": 11.485074043273926, + "learning_rate": 9.209388524227489e-06, + "loss": 2.5993, + "step": 7873000 + }, + { + "epoch": 2.4475921208268376, + "grad_norm": 17.699785232543945, + "learning_rate": 9.206797986219375e-06, + "loss": 2.6246, + "step": 7873500 + }, + { + "epoch": 2.4477475531073245, + "grad_norm": 10.39564323425293, + "learning_rate": 9.20420744821126e-06, + "loss": 2.6257, + "step": 7874000 + }, + { + "epoch": 2.4479029853878114, + "grad_norm": 8.746068000793457, + "learning_rate": 9.201616910203145e-06, + "loss": 2.6053, + "step": 7874500 + }, + { + "epoch": 2.4480584176682982, + "grad_norm": 8.381499290466309, + "learning_rate": 9.199026372195031e-06, + "loss": 2.649, + "step": 7875000 + }, + { + "epoch": 2.448213849948785, + "grad_norm": 10.69091510772705, + "learning_rate": 9.196435834186916e-06, + "loss": 2.6029, + "step": 7875500 + }, + { + "epoch": 2.448369282229272, + "grad_norm": 9.161779403686523, + "learning_rate": 9.193845296178802e-06, + "loss": 2.6333, + "step": 7876000 + }, + { + "epoch": 2.448524714509759, + "grad_norm": 32.84428787231445, + "learning_rate": 9.191254758170687e-06, + "loss": 2.6438, + "step": 7876500 + }, + { + "epoch": 2.4486801467902457, + "grad_norm": 11.16610050201416, + "learning_rate": 9.188664220162573e-06, + "loss": 2.6168, + "step": 7877000 + }, + { + "epoch": 2.4488355790707326, + "grad_norm": 9.021210670471191, + "learning_rate": 9.186073682154457e-06, + "loss": 2.6289, + "step": 7877500 + }, + { + "epoch": 2.4489910113512194, + "grad_norm": 7.450496196746826, + "learning_rate": 9.183483144146344e-06, + "loss": 2.6097, + "step": 7878000 + }, + { + "epoch": 2.4491464436317063, + "grad_norm": 8.692462921142578, + "learning_rate": 9.180892606138227e-06, + "loss": 2.5805, + "step": 7878500 + }, + { + "epoch": 2.449301875912193, + "grad_norm": 9.514822006225586, + "learning_rate": 9.178302068130115e-06, + "loss": 2.5674, + "step": 7879000 + }, + { + "epoch": 2.44945730819268, + "grad_norm": 20.982574462890625, + "learning_rate": 9.175711530122e-06, + "loss": 2.6485, + "step": 7879500 + }, + { + "epoch": 2.449612740473167, + "grad_norm": 9.783387184143066, + "learning_rate": 9.173120992113884e-06, + "loss": 2.5355, + "step": 7880000 + }, + { + "epoch": 2.4497681727536538, + "grad_norm": 13.89964485168457, + "learning_rate": 9.170530454105771e-06, + "loss": 2.6141, + "step": 7880500 + }, + { + "epoch": 2.4499236050341406, + "grad_norm": 9.531475067138672, + "learning_rate": 9.167939916097655e-06, + "loss": 2.6231, + "step": 7881000 + }, + { + "epoch": 2.4500790373146275, + "grad_norm": 9.37154483795166, + "learning_rate": 9.165349378089542e-06, + "loss": 2.6055, + "step": 7881500 + }, + { + "epoch": 2.4502344695951144, + "grad_norm": 12.640365600585938, + "learning_rate": 9.162758840081426e-06, + "loss": 2.6357, + "step": 7882000 + }, + { + "epoch": 2.4503899018756012, + "grad_norm": 34.791812896728516, + "learning_rate": 9.160168302073311e-06, + "loss": 2.6284, + "step": 7882500 + }, + { + "epoch": 2.450545334156088, + "grad_norm": 8.14860725402832, + "learning_rate": 9.157577764065197e-06, + "loss": 2.6472, + "step": 7883000 + }, + { + "epoch": 2.450700766436575, + "grad_norm": 12.595930099487305, + "learning_rate": 9.154987226057082e-06, + "loss": 2.634, + "step": 7883500 + }, + { + "epoch": 2.450856198717062, + "grad_norm": 10.775525093078613, + "learning_rate": 9.15239668804897e-06, + "loss": 2.5862, + "step": 7884000 + }, + { + "epoch": 2.4510116309975487, + "grad_norm": 6.007548809051514, + "learning_rate": 9.149806150040853e-06, + "loss": 2.597, + "step": 7884500 + }, + { + "epoch": 2.4511670632780356, + "grad_norm": 9.995139122009277, + "learning_rate": 9.147215612032738e-06, + "loss": 2.6402, + "step": 7885000 + }, + { + "epoch": 2.4513224955585224, + "grad_norm": 11.792154312133789, + "learning_rate": 9.144625074024624e-06, + "loss": 2.5721, + "step": 7885500 + }, + { + "epoch": 2.4514779278390093, + "grad_norm": 15.334895133972168, + "learning_rate": 9.14203453601651e-06, + "loss": 2.6594, + "step": 7886000 + }, + { + "epoch": 2.451633360119496, + "grad_norm": 9.975785255432129, + "learning_rate": 9.139443998008395e-06, + "loss": 2.6243, + "step": 7886500 + }, + { + "epoch": 2.451788792399983, + "grad_norm": 13.01883602142334, + "learning_rate": 9.13685346000028e-06, + "loss": 2.5995, + "step": 7887000 + }, + { + "epoch": 2.45194422468047, + "grad_norm": 9.894538879394531, + "learning_rate": 9.134262921992166e-06, + "loss": 2.6007, + "step": 7887500 + }, + { + "epoch": 2.4520996569609568, + "grad_norm": 11.086797714233398, + "learning_rate": 9.131672383984051e-06, + "loss": 2.6203, + "step": 7888000 + }, + { + "epoch": 2.4522550892414436, + "grad_norm": 9.693560600280762, + "learning_rate": 9.129081845975937e-06, + "loss": 2.6263, + "step": 7888500 + }, + { + "epoch": 2.452410521521931, + "grad_norm": 30.472360610961914, + "learning_rate": 9.126491307967822e-06, + "loss": 2.6019, + "step": 7889000 + }, + { + "epoch": 2.4525659538024174, + "grad_norm": 11.010520935058594, + "learning_rate": 9.123900769959708e-06, + "loss": 2.6297, + "step": 7889500 + }, + { + "epoch": 2.4527213860829047, + "grad_norm": 11.908914566040039, + "learning_rate": 9.121310231951593e-06, + "loss": 2.613, + "step": 7890000 + }, + { + "epoch": 2.452876818363391, + "grad_norm": 10.236100196838379, + "learning_rate": 9.118719693943478e-06, + "loss": 2.664, + "step": 7890500 + }, + { + "epoch": 2.4530322506438784, + "grad_norm": 15.493080139160156, + "learning_rate": 9.116129155935362e-06, + "loss": 2.5837, + "step": 7891000 + }, + { + "epoch": 2.4531876829243653, + "grad_norm": 12.80366039276123, + "learning_rate": 9.11353861792725e-06, + "loss": 2.5851, + "step": 7891500 + }, + { + "epoch": 2.453343115204852, + "grad_norm": 13.017190933227539, + "learning_rate": 9.110948079919133e-06, + "loss": 2.635, + "step": 7892000 + }, + { + "epoch": 2.453498547485339, + "grad_norm": 7.970086097717285, + "learning_rate": 9.10835754191102e-06, + "loss": 2.6178, + "step": 7892500 + }, + { + "epoch": 2.453653979765826, + "grad_norm": 18.32652473449707, + "learning_rate": 9.105767003902906e-06, + "loss": 2.6393, + "step": 7893000 + }, + { + "epoch": 2.4538094120463128, + "grad_norm": 9.270302772521973, + "learning_rate": 9.10317646589479e-06, + "loss": 2.625, + "step": 7893500 + }, + { + "epoch": 2.4539648443267996, + "grad_norm": 11.97114086151123, + "learning_rate": 9.100585927886677e-06, + "loss": 2.586, + "step": 7894000 + }, + { + "epoch": 2.4541202766072865, + "grad_norm": 9.236055374145508, + "learning_rate": 9.09799538987856e-06, + "loss": 2.6362, + "step": 7894500 + }, + { + "epoch": 2.4542757088877734, + "grad_norm": 10.81078052520752, + "learning_rate": 9.095404851870448e-06, + "loss": 2.6074, + "step": 7895000 + }, + { + "epoch": 2.45443114116826, + "grad_norm": 41.75994110107422, + "learning_rate": 9.092814313862331e-06, + "loss": 2.6269, + "step": 7895500 + }, + { + "epoch": 2.454586573448747, + "grad_norm": 10.42066478729248, + "learning_rate": 9.090223775854217e-06, + "loss": 2.6095, + "step": 7896000 + }, + { + "epoch": 2.454742005729234, + "grad_norm": 9.422821998596191, + "learning_rate": 9.087633237846102e-06, + "loss": 2.6383, + "step": 7896500 + }, + { + "epoch": 2.454897438009721, + "grad_norm": 8.492581367492676, + "learning_rate": 9.085042699837988e-06, + "loss": 2.6059, + "step": 7897000 + }, + { + "epoch": 2.4550528702902077, + "grad_norm": 8.397444725036621, + "learning_rate": 9.082452161829875e-06, + "loss": 2.6128, + "step": 7897500 + }, + { + "epoch": 2.4552083025706946, + "grad_norm": 11.039247512817383, + "learning_rate": 9.079861623821759e-06, + "loss": 2.6381, + "step": 7898000 + }, + { + "epoch": 2.4553637348511814, + "grad_norm": 8.043654441833496, + "learning_rate": 9.077271085813644e-06, + "loss": 2.6192, + "step": 7898500 + }, + { + "epoch": 2.4555191671316683, + "grad_norm": 10.350722312927246, + "learning_rate": 9.07468054780553e-06, + "loss": 2.6047, + "step": 7899000 + }, + { + "epoch": 2.455674599412155, + "grad_norm": 24.5250301361084, + "learning_rate": 9.072090009797415e-06, + "loss": 2.6085, + "step": 7899500 + }, + { + "epoch": 2.455830031692642, + "grad_norm": 9.495261192321777, + "learning_rate": 9.0694994717893e-06, + "loss": 2.6205, + "step": 7900000 + }, + { + "epoch": 2.455985463973129, + "grad_norm": 8.21374797821045, + "learning_rate": 9.066908933781186e-06, + "loss": 2.6453, + "step": 7900500 + }, + { + "epoch": 2.4561408962536158, + "grad_norm": 17.108938217163086, + "learning_rate": 9.064318395773071e-06, + "loss": 2.6943, + "step": 7901000 + }, + { + "epoch": 2.4562963285341026, + "grad_norm": 30.42352867126465, + "learning_rate": 9.061727857764957e-06, + "loss": 2.6084, + "step": 7901500 + }, + { + "epoch": 2.4564517608145895, + "grad_norm": 10.635549545288086, + "learning_rate": 9.059137319756842e-06, + "loss": 2.6298, + "step": 7902000 + }, + { + "epoch": 2.4566071930950764, + "grad_norm": 8.498621940612793, + "learning_rate": 9.056546781748728e-06, + "loss": 2.5661, + "step": 7902500 + }, + { + "epoch": 2.4567626253755632, + "grad_norm": 11.933711051940918, + "learning_rate": 9.053956243740613e-06, + "loss": 2.6002, + "step": 7903000 + }, + { + "epoch": 2.45691805765605, + "grad_norm": 8.962754249572754, + "learning_rate": 9.051365705732499e-06, + "loss": 2.5956, + "step": 7903500 + }, + { + "epoch": 2.457073489936537, + "grad_norm": 49.34396743774414, + "learning_rate": 9.048775167724384e-06, + "loss": 2.6224, + "step": 7904000 + }, + { + "epoch": 2.457228922217024, + "grad_norm": 10.135553359985352, + "learning_rate": 9.046184629716268e-06, + "loss": 2.5798, + "step": 7904500 + }, + { + "epoch": 2.4573843544975107, + "grad_norm": 10.795137405395508, + "learning_rate": 9.043594091708155e-06, + "loss": 2.5997, + "step": 7905000 + }, + { + "epoch": 2.4575397867779976, + "grad_norm": 35.101707458496094, + "learning_rate": 9.04100355370004e-06, + "loss": 2.5763, + "step": 7905500 + }, + { + "epoch": 2.4576952190584844, + "grad_norm": 8.430978775024414, + "learning_rate": 9.038413015691926e-06, + "loss": 2.6466, + "step": 7906000 + }, + { + "epoch": 2.4578506513389713, + "grad_norm": 10.414647102355957, + "learning_rate": 9.035822477683811e-06, + "loss": 2.6509, + "step": 7906500 + }, + { + "epoch": 2.458006083619458, + "grad_norm": 28.690887451171875, + "learning_rate": 9.033231939675695e-06, + "loss": 2.584, + "step": 7907000 + }, + { + "epoch": 2.458161515899945, + "grad_norm": 9.330820083618164, + "learning_rate": 9.030641401667582e-06, + "loss": 2.5989, + "step": 7907500 + }, + { + "epoch": 2.458316948180432, + "grad_norm": 8.350861549377441, + "learning_rate": 9.028050863659466e-06, + "loss": 2.6283, + "step": 7908000 + }, + { + "epoch": 2.4584723804609188, + "grad_norm": 10.91111946105957, + "learning_rate": 9.025460325651353e-06, + "loss": 2.6703, + "step": 7908500 + }, + { + "epoch": 2.4586278127414056, + "grad_norm": 8.972576141357422, + "learning_rate": 9.022869787643237e-06, + "loss": 2.6345, + "step": 7909000 + }, + { + "epoch": 2.4587832450218925, + "grad_norm": 13.101102828979492, + "learning_rate": 9.020279249635123e-06, + "loss": 2.6621, + "step": 7909500 + }, + { + "epoch": 2.4589386773023794, + "grad_norm": 7.460075378417969, + "learning_rate": 9.01768871162701e-06, + "loss": 2.6388, + "step": 7910000 + }, + { + "epoch": 2.4590941095828662, + "grad_norm": 8.681485176086426, + "learning_rate": 9.015098173618893e-06, + "loss": 2.6443, + "step": 7910500 + }, + { + "epoch": 2.459249541863353, + "grad_norm": 9.294861793518066, + "learning_rate": 9.01250763561078e-06, + "loss": 2.582, + "step": 7911000 + }, + { + "epoch": 2.45940497414384, + "grad_norm": 9.620014190673828, + "learning_rate": 9.009917097602664e-06, + "loss": 2.6125, + "step": 7911500 + }, + { + "epoch": 2.459560406424327, + "grad_norm": 10.073330879211426, + "learning_rate": 9.00732655959455e-06, + "loss": 2.6628, + "step": 7912000 + }, + { + "epoch": 2.459715838704814, + "grad_norm": 11.993330001831055, + "learning_rate": 9.004736021586435e-06, + "loss": 2.6233, + "step": 7912500 + }, + { + "epoch": 2.4598712709853006, + "grad_norm": 9.261289596557617, + "learning_rate": 9.00214548357832e-06, + "loss": 2.6253, + "step": 7913000 + }, + { + "epoch": 2.460026703265788, + "grad_norm": 10.171540260314941, + "learning_rate": 8.999554945570206e-06, + "loss": 2.5815, + "step": 7913500 + }, + { + "epoch": 2.4601821355462743, + "grad_norm": 9.597672462463379, + "learning_rate": 8.996964407562092e-06, + "loss": 2.6188, + "step": 7914000 + }, + { + "epoch": 2.4603375678267616, + "grad_norm": 10.083234786987305, + "learning_rate": 8.994373869553977e-06, + "loss": 2.6416, + "step": 7914500 + }, + { + "epoch": 2.460493000107248, + "grad_norm": 10.443456649780273, + "learning_rate": 8.991783331545863e-06, + "loss": 2.6199, + "step": 7915000 + }, + { + "epoch": 2.4606484323877353, + "grad_norm": 15.33239459991455, + "learning_rate": 8.989192793537748e-06, + "loss": 2.6462, + "step": 7915500 + }, + { + "epoch": 2.460803864668222, + "grad_norm": 9.512598037719727, + "learning_rate": 8.986602255529633e-06, + "loss": 2.6559, + "step": 7916000 + }, + { + "epoch": 2.460959296948709, + "grad_norm": 8.357075691223145, + "learning_rate": 8.984011717521519e-06, + "loss": 2.5847, + "step": 7916500 + }, + { + "epoch": 2.461114729229196, + "grad_norm": 11.535090446472168, + "learning_rate": 8.981421179513404e-06, + "loss": 2.63, + "step": 7917000 + }, + { + "epoch": 2.461270161509683, + "grad_norm": 9.934530258178711, + "learning_rate": 8.97883064150529e-06, + "loss": 2.6578, + "step": 7917500 + }, + { + "epoch": 2.4614255937901697, + "grad_norm": 8.926055908203125, + "learning_rate": 8.976240103497174e-06, + "loss": 2.6681, + "step": 7918000 + }, + { + "epoch": 2.4615810260706565, + "grad_norm": 10.192058563232422, + "learning_rate": 8.97364956548906e-06, + "loss": 2.6268, + "step": 7918500 + }, + { + "epoch": 2.4617364583511434, + "grad_norm": 8.461067199707031, + "learning_rate": 8.971059027480946e-06, + "loss": 2.6247, + "step": 7919000 + }, + { + "epoch": 2.4618918906316303, + "grad_norm": 8.36262321472168, + "learning_rate": 8.968468489472832e-06, + "loss": 2.6198, + "step": 7919500 + }, + { + "epoch": 2.462047322912117, + "grad_norm": 11.97488021850586, + "learning_rate": 8.965877951464717e-06, + "loss": 2.586, + "step": 7920000 + }, + { + "epoch": 2.462202755192604, + "grad_norm": 12.402533531188965, + "learning_rate": 8.963287413456601e-06, + "loss": 2.605, + "step": 7920500 + }, + { + "epoch": 2.462358187473091, + "grad_norm": 9.080472946166992, + "learning_rate": 8.960696875448488e-06, + "loss": 2.6123, + "step": 7921000 + }, + { + "epoch": 2.4625136197535777, + "grad_norm": 8.83337688446045, + "learning_rate": 8.958106337440372e-06, + "loss": 2.6171, + "step": 7921500 + }, + { + "epoch": 2.4626690520340646, + "grad_norm": 14.641228675842285, + "learning_rate": 8.955515799432259e-06, + "loss": 2.6113, + "step": 7922000 + }, + { + "epoch": 2.4628244843145515, + "grad_norm": 7.918129920959473, + "learning_rate": 8.952925261424143e-06, + "loss": 2.6009, + "step": 7922500 + }, + { + "epoch": 2.4629799165950383, + "grad_norm": 11.85522747039795, + "learning_rate": 8.950334723416028e-06, + "loss": 2.5987, + "step": 7923000 + }, + { + "epoch": 2.463135348875525, + "grad_norm": 8.398701667785645, + "learning_rate": 8.947744185407915e-06, + "loss": 2.6098, + "step": 7923500 + }, + { + "epoch": 2.463290781156012, + "grad_norm": 11.257833480834961, + "learning_rate": 8.945153647399799e-06, + "loss": 2.605, + "step": 7924000 + }, + { + "epoch": 2.463446213436499, + "grad_norm": 11.113020896911621, + "learning_rate": 8.942563109391686e-06, + "loss": 2.6473, + "step": 7924500 + }, + { + "epoch": 2.463601645716986, + "grad_norm": 10.561813354492188, + "learning_rate": 8.93997257138357e-06, + "loss": 2.6236, + "step": 7925000 + }, + { + "epoch": 2.4637570779974727, + "grad_norm": 8.218989372253418, + "learning_rate": 8.937382033375456e-06, + "loss": 2.6231, + "step": 7925500 + }, + { + "epoch": 2.4639125102779595, + "grad_norm": 9.801511764526367, + "learning_rate": 8.934791495367341e-06, + "loss": 2.6109, + "step": 7926000 + }, + { + "epoch": 2.4640679425584464, + "grad_norm": 12.00722885131836, + "learning_rate": 8.932200957359226e-06, + "loss": 2.5747, + "step": 7926500 + }, + { + "epoch": 2.4642233748389333, + "grad_norm": 10.817893981933594, + "learning_rate": 8.929610419351112e-06, + "loss": 2.5412, + "step": 7927000 + }, + { + "epoch": 2.46437880711942, + "grad_norm": 11.267448425292969, + "learning_rate": 8.927019881342997e-06, + "loss": 2.6345, + "step": 7927500 + }, + { + "epoch": 2.464534239399907, + "grad_norm": 8.317378044128418, + "learning_rate": 8.924429343334883e-06, + "loss": 2.6214, + "step": 7928000 + }, + { + "epoch": 2.464689671680394, + "grad_norm": 9.015057563781738, + "learning_rate": 8.921838805326768e-06, + "loss": 2.6244, + "step": 7928500 + }, + { + "epoch": 2.4648451039608807, + "grad_norm": 8.029173851013184, + "learning_rate": 8.919248267318654e-06, + "loss": 2.6029, + "step": 7929000 + }, + { + "epoch": 2.4650005362413676, + "grad_norm": 8.584662437438965, + "learning_rate": 8.91665772931054e-06, + "loss": 2.6516, + "step": 7929500 + }, + { + "epoch": 2.4651559685218545, + "grad_norm": 9.526599884033203, + "learning_rate": 8.914067191302425e-06, + "loss": 2.6374, + "step": 7930000 + }, + { + "epoch": 2.4653114008023413, + "grad_norm": 11.268685340881348, + "learning_rate": 8.91147665329431e-06, + "loss": 2.595, + "step": 7930500 + }, + { + "epoch": 2.465466833082828, + "grad_norm": 9.371817588806152, + "learning_rate": 8.908886115286196e-06, + "loss": 2.5981, + "step": 7931000 + }, + { + "epoch": 2.465622265363315, + "grad_norm": 10.970953941345215, + "learning_rate": 8.90629557727808e-06, + "loss": 2.5846, + "step": 7931500 + }, + { + "epoch": 2.465777697643802, + "grad_norm": 12.561467170715332, + "learning_rate": 8.903705039269966e-06, + "loss": 2.5623, + "step": 7932000 + }, + { + "epoch": 2.465933129924289, + "grad_norm": 9.381128311157227, + "learning_rate": 8.901114501261852e-06, + "loss": 2.5957, + "step": 7932500 + }, + { + "epoch": 2.4660885622047757, + "grad_norm": 9.231942176818848, + "learning_rate": 8.898523963253737e-06, + "loss": 2.5848, + "step": 7933000 + }, + { + "epoch": 2.4662439944852625, + "grad_norm": 9.154808044433594, + "learning_rate": 8.895933425245623e-06, + "loss": 2.6023, + "step": 7933500 + }, + { + "epoch": 2.4663994267657494, + "grad_norm": 15.474614143371582, + "learning_rate": 8.893342887237507e-06, + "loss": 2.6384, + "step": 7934000 + }, + { + "epoch": 2.4665548590462363, + "grad_norm": 15.976749420166016, + "learning_rate": 8.890752349229394e-06, + "loss": 2.6337, + "step": 7934500 + }, + { + "epoch": 2.466710291326723, + "grad_norm": 9.35623550415039, + "learning_rate": 8.888161811221278e-06, + "loss": 2.617, + "step": 7935000 + }, + { + "epoch": 2.46686572360721, + "grad_norm": 11.129786491394043, + "learning_rate": 8.885571273213165e-06, + "loss": 2.6101, + "step": 7935500 + }, + { + "epoch": 2.467021155887697, + "grad_norm": 10.320941925048828, + "learning_rate": 8.882980735205048e-06, + "loss": 2.6127, + "step": 7936000 + }, + { + "epoch": 2.4671765881681837, + "grad_norm": 10.473549842834473, + "learning_rate": 8.880390197196934e-06, + "loss": 2.5973, + "step": 7936500 + }, + { + "epoch": 2.467332020448671, + "grad_norm": 8.99138069152832, + "learning_rate": 8.877799659188821e-06, + "loss": 2.6182, + "step": 7937000 + }, + { + "epoch": 2.4674874527291575, + "grad_norm": 12.349583625793457, + "learning_rate": 8.875209121180705e-06, + "loss": 2.6095, + "step": 7937500 + }, + { + "epoch": 2.467642885009645, + "grad_norm": 8.524978637695312, + "learning_rate": 8.872618583172592e-06, + "loss": 2.5933, + "step": 7938000 + }, + { + "epoch": 2.467798317290131, + "grad_norm": 10.8735990524292, + "learning_rate": 8.870028045164476e-06, + "loss": 2.6529, + "step": 7938500 + }, + { + "epoch": 2.4679537495706185, + "grad_norm": 6.644540786743164, + "learning_rate": 8.867437507156361e-06, + "loss": 2.6158, + "step": 7939000 + }, + { + "epoch": 2.4681091818511054, + "grad_norm": 9.969117164611816, + "learning_rate": 8.864846969148247e-06, + "loss": 2.5899, + "step": 7939500 + }, + { + "epoch": 2.4682646141315923, + "grad_norm": 10.464470863342285, + "learning_rate": 8.862256431140132e-06, + "loss": 2.6069, + "step": 7940000 + }, + { + "epoch": 2.468420046412079, + "grad_norm": 9.504630088806152, + "learning_rate": 8.859665893132018e-06, + "loss": 2.6436, + "step": 7940500 + }, + { + "epoch": 2.468575478692566, + "grad_norm": 33.38409423828125, + "learning_rate": 8.857075355123903e-06, + "loss": 2.5922, + "step": 7941000 + }, + { + "epoch": 2.468730910973053, + "grad_norm": 14.360301971435547, + "learning_rate": 8.854484817115789e-06, + "loss": 2.6888, + "step": 7941500 + }, + { + "epoch": 2.4688863432535397, + "grad_norm": 17.77228546142578, + "learning_rate": 8.851894279107674e-06, + "loss": 2.604, + "step": 7942000 + }, + { + "epoch": 2.4690417755340266, + "grad_norm": 8.667439460754395, + "learning_rate": 8.84930374109956e-06, + "loss": 2.6001, + "step": 7942500 + }, + { + "epoch": 2.4691972078145135, + "grad_norm": 10.59520149230957, + "learning_rate": 8.846713203091445e-06, + "loss": 2.5996, + "step": 7943000 + }, + { + "epoch": 2.4693526400950003, + "grad_norm": 8.937643051147461, + "learning_rate": 8.84412266508333e-06, + "loss": 2.5399, + "step": 7943500 + }, + { + "epoch": 2.469508072375487, + "grad_norm": 9.34200382232666, + "learning_rate": 8.841532127075216e-06, + "loss": 2.6097, + "step": 7944000 + }, + { + "epoch": 2.469663504655974, + "grad_norm": 11.932889938354492, + "learning_rate": 8.838941589067101e-06, + "loss": 2.573, + "step": 7944500 + }, + { + "epoch": 2.469818936936461, + "grad_norm": 18.98653793334961, + "learning_rate": 8.836351051058985e-06, + "loss": 2.6108, + "step": 7945000 + }, + { + "epoch": 2.469974369216948, + "grad_norm": 10.984877586364746, + "learning_rate": 8.833760513050872e-06, + "loss": 2.608, + "step": 7945500 + }, + { + "epoch": 2.4701298014974347, + "grad_norm": 9.512216567993164, + "learning_rate": 8.831169975042758e-06, + "loss": 2.592, + "step": 7946000 + }, + { + "epoch": 2.4702852337779215, + "grad_norm": 9.444130897521973, + "learning_rate": 8.828579437034643e-06, + "loss": 2.6452, + "step": 7946500 + }, + { + "epoch": 2.4704406660584084, + "grad_norm": 10.361021041870117, + "learning_rate": 8.825988899026529e-06, + "loss": 2.6358, + "step": 7947000 + }, + { + "epoch": 2.4705960983388953, + "grad_norm": 9.960209846496582, + "learning_rate": 8.823398361018412e-06, + "loss": 2.6213, + "step": 7947500 + }, + { + "epoch": 2.470751530619382, + "grad_norm": 11.767553329467773, + "learning_rate": 8.8208078230103e-06, + "loss": 2.6313, + "step": 7948000 + }, + { + "epoch": 2.470906962899869, + "grad_norm": 9.609537124633789, + "learning_rate": 8.818217285002183e-06, + "loss": 2.5725, + "step": 7948500 + }, + { + "epoch": 2.471062395180356, + "grad_norm": 10.094651222229004, + "learning_rate": 8.81562674699407e-06, + "loss": 2.6186, + "step": 7949000 + }, + { + "epoch": 2.4712178274608427, + "grad_norm": 16.748062133789062, + "learning_rate": 8.813036208985954e-06, + "loss": 2.5957, + "step": 7949500 + }, + { + "epoch": 2.4713732597413296, + "grad_norm": 11.632482528686523, + "learning_rate": 8.81044567097784e-06, + "loss": 2.6582, + "step": 7950000 + }, + { + "epoch": 2.4715286920218165, + "grad_norm": 8.988479614257812, + "learning_rate": 8.807855132969727e-06, + "loss": 2.6572, + "step": 7950500 + }, + { + "epoch": 2.4716841243023033, + "grad_norm": 10.15392017364502, + "learning_rate": 8.80526459496161e-06, + "loss": 2.5963, + "step": 7951000 + }, + { + "epoch": 2.47183955658279, + "grad_norm": 12.542777061462402, + "learning_rate": 8.802674056953498e-06, + "loss": 2.61, + "step": 7951500 + }, + { + "epoch": 2.471994988863277, + "grad_norm": 9.57697868347168, + "learning_rate": 8.800083518945381e-06, + "loss": 2.6917, + "step": 7952000 + }, + { + "epoch": 2.472150421143764, + "grad_norm": 10.630351066589355, + "learning_rate": 8.797492980937267e-06, + "loss": 2.6263, + "step": 7952500 + }, + { + "epoch": 2.472305853424251, + "grad_norm": 10.71274185180664, + "learning_rate": 8.794902442929152e-06, + "loss": 2.6037, + "step": 7953000 + }, + { + "epoch": 2.4724612857047377, + "grad_norm": 9.528460502624512, + "learning_rate": 8.792311904921038e-06, + "loss": 2.6587, + "step": 7953500 + }, + { + "epoch": 2.4726167179852245, + "grad_norm": 9.341931343078613, + "learning_rate": 8.789721366912923e-06, + "loss": 2.6274, + "step": 7954000 + }, + { + "epoch": 2.4727721502657114, + "grad_norm": 10.592209815979004, + "learning_rate": 8.787130828904809e-06, + "loss": 2.61, + "step": 7954500 + }, + { + "epoch": 2.4729275825461983, + "grad_norm": 10.129079818725586, + "learning_rate": 8.784540290896694e-06, + "loss": 2.5904, + "step": 7955000 + }, + { + "epoch": 2.473083014826685, + "grad_norm": 5.51365327835083, + "learning_rate": 8.78194975288858e-06, + "loss": 2.6131, + "step": 7955500 + }, + { + "epoch": 2.473238447107172, + "grad_norm": 10.214608192443848, + "learning_rate": 8.779359214880465e-06, + "loss": 2.5947, + "step": 7956000 + }, + { + "epoch": 2.473393879387659, + "grad_norm": 12.785385131835938, + "learning_rate": 8.77676867687235e-06, + "loss": 2.6356, + "step": 7956500 + }, + { + "epoch": 2.4735493116681457, + "grad_norm": 9.458023071289062, + "learning_rate": 8.774178138864236e-06, + "loss": 2.6512, + "step": 7957000 + }, + { + "epoch": 2.4737047439486326, + "grad_norm": 12.323092460632324, + "learning_rate": 8.771587600856122e-06, + "loss": 2.6572, + "step": 7957500 + }, + { + "epoch": 2.4738601762291195, + "grad_norm": 10.478311538696289, + "learning_rate": 8.768997062848007e-06, + "loss": 2.618, + "step": 7958000 + }, + { + "epoch": 2.4740156085096063, + "grad_norm": 8.552300453186035, + "learning_rate": 8.766406524839892e-06, + "loss": 2.6018, + "step": 7958500 + }, + { + "epoch": 2.474171040790093, + "grad_norm": 8.893969535827637, + "learning_rate": 8.763815986831778e-06, + "loss": 2.6263, + "step": 7959000 + }, + { + "epoch": 2.47432647307058, + "grad_norm": 10.872102737426758, + "learning_rate": 8.761225448823663e-06, + "loss": 2.6188, + "step": 7959500 + }, + { + "epoch": 2.474481905351067, + "grad_norm": 9.906803131103516, + "learning_rate": 8.758634910815549e-06, + "loss": 2.5869, + "step": 7960000 + }, + { + "epoch": 2.4746373376315542, + "grad_norm": 18.672637939453125, + "learning_rate": 8.756044372807434e-06, + "loss": 2.605, + "step": 7960500 + }, + { + "epoch": 2.4747927699120407, + "grad_norm": 12.712958335876465, + "learning_rate": 8.75345383479932e-06, + "loss": 2.5584, + "step": 7961000 + }, + { + "epoch": 2.474948202192528, + "grad_norm": 9.483352661132812, + "learning_rate": 8.750863296791205e-06, + "loss": 2.6029, + "step": 7961500 + }, + { + "epoch": 2.4751036344730144, + "grad_norm": 9.34206771850586, + "learning_rate": 8.748272758783089e-06, + "loss": 2.6323, + "step": 7962000 + }, + { + "epoch": 2.4752590667535017, + "grad_norm": 10.20322036743164, + "learning_rate": 8.745682220774976e-06, + "loss": 2.6416, + "step": 7962500 + }, + { + "epoch": 2.475414499033988, + "grad_norm": 18.40437126159668, + "learning_rate": 8.74309168276686e-06, + "loss": 2.6139, + "step": 7963000 + }, + { + "epoch": 2.4755699313144754, + "grad_norm": 12.322626113891602, + "learning_rate": 8.740501144758747e-06, + "loss": 2.5998, + "step": 7963500 + }, + { + "epoch": 2.4757253635949623, + "grad_norm": 11.546174049377441, + "learning_rate": 8.737910606750633e-06, + "loss": 2.6105, + "step": 7964000 + }, + { + "epoch": 2.475880795875449, + "grad_norm": 12.841951370239258, + "learning_rate": 8.735320068742516e-06, + "loss": 2.6342, + "step": 7964500 + }, + { + "epoch": 2.476036228155936, + "grad_norm": 12.045690536499023, + "learning_rate": 8.732729530734403e-06, + "loss": 2.6512, + "step": 7965000 + }, + { + "epoch": 2.476191660436423, + "grad_norm": 10.11296558380127, + "learning_rate": 8.730138992726287e-06, + "loss": 2.592, + "step": 7965500 + }, + { + "epoch": 2.47634709271691, + "grad_norm": 10.17322063446045, + "learning_rate": 8.727548454718174e-06, + "loss": 2.6307, + "step": 7966000 + }, + { + "epoch": 2.4765025249973966, + "grad_norm": 8.081123352050781, + "learning_rate": 8.724957916710058e-06, + "loss": 2.6216, + "step": 7966500 + }, + { + "epoch": 2.4766579572778835, + "grad_norm": 10.045135498046875, + "learning_rate": 8.722367378701944e-06, + "loss": 2.6489, + "step": 7967000 + }, + { + "epoch": 2.4768133895583704, + "grad_norm": 9.299391746520996, + "learning_rate": 8.719776840693829e-06, + "loss": 2.6154, + "step": 7967500 + }, + { + "epoch": 2.4769688218388572, + "grad_norm": 9.20514965057373, + "learning_rate": 8.717186302685714e-06, + "loss": 2.6565, + "step": 7968000 + }, + { + "epoch": 2.477124254119344, + "grad_norm": 11.591693878173828, + "learning_rate": 8.714595764677602e-06, + "loss": 2.6241, + "step": 7968500 + }, + { + "epoch": 2.477279686399831, + "grad_norm": 9.65050220489502, + "learning_rate": 8.712005226669485e-06, + "loss": 2.5889, + "step": 7969000 + }, + { + "epoch": 2.477435118680318, + "grad_norm": 11.668248176574707, + "learning_rate": 8.709414688661371e-06, + "loss": 2.6161, + "step": 7969500 + }, + { + "epoch": 2.4775905509608047, + "grad_norm": 9.290980339050293, + "learning_rate": 8.706824150653256e-06, + "loss": 2.6161, + "step": 7970000 + }, + { + "epoch": 2.4777459832412916, + "grad_norm": 8.967591285705566, + "learning_rate": 8.704233612645142e-06, + "loss": 2.5752, + "step": 7970500 + }, + { + "epoch": 2.4779014155217784, + "grad_norm": 12.61303997039795, + "learning_rate": 8.701643074637027e-06, + "loss": 2.6196, + "step": 7971000 + }, + { + "epoch": 2.4780568478022653, + "grad_norm": 29.154176712036133, + "learning_rate": 8.699052536628913e-06, + "loss": 2.5614, + "step": 7971500 + }, + { + "epoch": 2.478212280082752, + "grad_norm": 24.20185661315918, + "learning_rate": 8.696461998620798e-06, + "loss": 2.5854, + "step": 7972000 + }, + { + "epoch": 2.478367712363239, + "grad_norm": 11.386316299438477, + "learning_rate": 8.693871460612684e-06, + "loss": 2.581, + "step": 7972500 + }, + { + "epoch": 2.478523144643726, + "grad_norm": 12.430477142333984, + "learning_rate": 8.691280922604569e-06, + "loss": 2.5972, + "step": 7973000 + }, + { + "epoch": 2.478678576924213, + "grad_norm": 9.481046676635742, + "learning_rate": 8.688690384596455e-06, + "loss": 2.6149, + "step": 7973500 + }, + { + "epoch": 2.4788340092046997, + "grad_norm": 11.501994132995605, + "learning_rate": 8.68609984658834e-06, + "loss": 2.609, + "step": 7974000 + }, + { + "epoch": 2.4789894414851865, + "grad_norm": 12.415721893310547, + "learning_rate": 8.683509308580225e-06, + "loss": 2.6106, + "step": 7974500 + }, + { + "epoch": 2.4791448737656734, + "grad_norm": 9.544589042663574, + "learning_rate": 8.680918770572111e-06, + "loss": 2.5901, + "step": 7975000 + }, + { + "epoch": 2.4793003060461603, + "grad_norm": 7.331150054931641, + "learning_rate": 8.678328232563995e-06, + "loss": 2.6207, + "step": 7975500 + }, + { + "epoch": 2.479455738326647, + "grad_norm": 8.443598747253418, + "learning_rate": 8.675737694555882e-06, + "loss": 2.577, + "step": 7976000 + }, + { + "epoch": 2.479611170607134, + "grad_norm": 11.09217643737793, + "learning_rate": 8.673147156547766e-06, + "loss": 2.6195, + "step": 7976500 + }, + { + "epoch": 2.479766602887621, + "grad_norm": 8.60246753692627, + "learning_rate": 8.670556618539653e-06, + "loss": 2.6473, + "step": 7977000 + }, + { + "epoch": 2.4799220351681077, + "grad_norm": 9.385838508605957, + "learning_rate": 8.667966080531538e-06, + "loss": 2.6223, + "step": 7977500 + }, + { + "epoch": 2.4800774674485946, + "grad_norm": 11.976296424865723, + "learning_rate": 8.665375542523422e-06, + "loss": 2.5847, + "step": 7978000 + }, + { + "epoch": 2.4802328997290815, + "grad_norm": 11.904153823852539, + "learning_rate": 8.662785004515309e-06, + "loss": 2.6254, + "step": 7978500 + }, + { + "epoch": 2.4803883320095683, + "grad_norm": 10.407976150512695, + "learning_rate": 8.660194466507193e-06, + "loss": 2.6296, + "step": 7979000 + }, + { + "epoch": 2.480543764290055, + "grad_norm": 11.755988121032715, + "learning_rate": 8.65760392849908e-06, + "loss": 2.6139, + "step": 7979500 + }, + { + "epoch": 2.480699196570542, + "grad_norm": 8.265949249267578, + "learning_rate": 8.655013390490964e-06, + "loss": 2.6197, + "step": 7980000 + }, + { + "epoch": 2.480854628851029, + "grad_norm": 10.657526016235352, + "learning_rate": 8.65242285248285e-06, + "loss": 2.6122, + "step": 7980500 + }, + { + "epoch": 2.481010061131516, + "grad_norm": 11.017593383789062, + "learning_rate": 8.649832314474735e-06, + "loss": 2.6161, + "step": 7981000 + }, + { + "epoch": 2.4811654934120027, + "grad_norm": 11.376787185668945, + "learning_rate": 8.64724177646662e-06, + "loss": 2.5718, + "step": 7981500 + }, + { + "epoch": 2.4813209256924895, + "grad_norm": 9.928119659423828, + "learning_rate": 8.644651238458507e-06, + "loss": 2.6106, + "step": 7982000 + }, + { + "epoch": 2.4814763579729764, + "grad_norm": 13.418272972106934, + "learning_rate": 8.642060700450391e-06, + "loss": 2.6202, + "step": 7982500 + }, + { + "epoch": 2.4816317902534633, + "grad_norm": 9.097712516784668, + "learning_rate": 8.639470162442277e-06, + "loss": 2.6653, + "step": 7983000 + }, + { + "epoch": 2.48178722253395, + "grad_norm": 14.665319442749023, + "learning_rate": 8.636879624434162e-06, + "loss": 2.627, + "step": 7983500 + }, + { + "epoch": 2.481942654814437, + "grad_norm": 9.9442777633667, + "learning_rate": 8.634289086426047e-06, + "loss": 2.6122, + "step": 7984000 + }, + { + "epoch": 2.482098087094924, + "grad_norm": 11.697737693786621, + "learning_rate": 8.631698548417933e-06, + "loss": 2.5826, + "step": 7984500 + }, + { + "epoch": 2.482253519375411, + "grad_norm": 10.375594139099121, + "learning_rate": 8.629108010409818e-06, + "loss": 2.6183, + "step": 7985000 + }, + { + "epoch": 2.4824089516558976, + "grad_norm": 28.799928665161133, + "learning_rate": 8.626517472401704e-06, + "loss": 2.5923, + "step": 7985500 + }, + { + "epoch": 2.482564383936385, + "grad_norm": 10.142996788024902, + "learning_rate": 8.62392693439359e-06, + "loss": 2.6195, + "step": 7986000 + }, + { + "epoch": 2.4827198162168713, + "grad_norm": 8.597224235534668, + "learning_rate": 8.621336396385475e-06, + "loss": 2.6073, + "step": 7986500 + }, + { + "epoch": 2.4828752484973586, + "grad_norm": 15.999691009521484, + "learning_rate": 8.61874585837736e-06, + "loss": 2.5798, + "step": 7987000 + }, + { + "epoch": 2.483030680777845, + "grad_norm": 17.4385929107666, + "learning_rate": 8.616155320369246e-06, + "loss": 2.6528, + "step": 7987500 + }, + { + "epoch": 2.4831861130583324, + "grad_norm": 7.875405311584473, + "learning_rate": 8.613564782361131e-06, + "loss": 2.6126, + "step": 7988000 + }, + { + "epoch": 2.4833415453388192, + "grad_norm": 9.675307273864746, + "learning_rate": 8.610974244353017e-06, + "loss": 2.6322, + "step": 7988500 + }, + { + "epoch": 2.483496977619306, + "grad_norm": 10.421036720275879, + "learning_rate": 8.6083837063449e-06, + "loss": 2.6126, + "step": 7989000 + }, + { + "epoch": 2.483652409899793, + "grad_norm": 10.263211250305176, + "learning_rate": 8.605793168336788e-06, + "loss": 2.5932, + "step": 7989500 + }, + { + "epoch": 2.48380784218028, + "grad_norm": 30.88195037841797, + "learning_rate": 8.603202630328671e-06, + "loss": 2.6219, + "step": 7990000 + }, + { + "epoch": 2.4839632744607667, + "grad_norm": 10.999700546264648, + "learning_rate": 8.600612092320558e-06, + "loss": 2.6377, + "step": 7990500 + }, + { + "epoch": 2.4841187067412536, + "grad_norm": 11.465639114379883, + "learning_rate": 8.598021554312444e-06, + "loss": 2.6304, + "step": 7991000 + }, + { + "epoch": 2.4842741390217404, + "grad_norm": 10.499449729919434, + "learning_rate": 8.595431016304328e-06, + "loss": 2.5879, + "step": 7991500 + }, + { + "epoch": 2.4844295713022273, + "grad_norm": 8.966840744018555, + "learning_rate": 8.592840478296215e-06, + "loss": 2.6181, + "step": 7992000 + }, + { + "epoch": 2.484585003582714, + "grad_norm": 24.757699966430664, + "learning_rate": 8.590249940288099e-06, + "loss": 2.6105, + "step": 7992500 + }, + { + "epoch": 2.484740435863201, + "grad_norm": 9.411672592163086, + "learning_rate": 8.587659402279986e-06, + "loss": 2.5723, + "step": 7993000 + }, + { + "epoch": 2.484895868143688, + "grad_norm": 13.88487720489502, + "learning_rate": 8.58506886427187e-06, + "loss": 2.6079, + "step": 7993500 + }, + { + "epoch": 2.4850513004241748, + "grad_norm": 16.59061622619629, + "learning_rate": 8.582478326263755e-06, + "loss": 2.602, + "step": 7994000 + }, + { + "epoch": 2.4852067327046616, + "grad_norm": 9.276825904846191, + "learning_rate": 8.57988778825564e-06, + "loss": 2.6924, + "step": 7994500 + }, + { + "epoch": 2.4853621649851485, + "grad_norm": 9.441208839416504, + "learning_rate": 8.577297250247526e-06, + "loss": 2.5777, + "step": 7995000 + }, + { + "epoch": 2.4855175972656354, + "grad_norm": 11.403205871582031, + "learning_rate": 8.574706712239413e-06, + "loss": 2.6629, + "step": 7995500 + }, + { + "epoch": 2.4856730295461222, + "grad_norm": 8.996880531311035, + "learning_rate": 8.572116174231297e-06, + "loss": 2.624, + "step": 7996000 + }, + { + "epoch": 2.485828461826609, + "grad_norm": 9.531983375549316, + "learning_rate": 8.569525636223182e-06, + "loss": 2.5924, + "step": 7996500 + }, + { + "epoch": 2.485983894107096, + "grad_norm": 9.96002197265625, + "learning_rate": 8.566935098215068e-06, + "loss": 2.606, + "step": 7997000 + }, + { + "epoch": 2.486139326387583, + "grad_norm": 9.510645866394043, + "learning_rate": 8.564344560206953e-06, + "loss": 2.5863, + "step": 7997500 + }, + { + "epoch": 2.4862947586680697, + "grad_norm": 11.858169555664062, + "learning_rate": 8.561754022198839e-06, + "loss": 2.6233, + "step": 7998000 + }, + { + "epoch": 2.4864501909485566, + "grad_norm": 11.053232192993164, + "learning_rate": 8.559163484190724e-06, + "loss": 2.6273, + "step": 7998500 + }, + { + "epoch": 2.4866056232290434, + "grad_norm": 9.941231727600098, + "learning_rate": 8.55657294618261e-06, + "loss": 2.6297, + "step": 7999000 + }, + { + "epoch": 2.4867610555095303, + "grad_norm": 7.314128875732422, + "learning_rate": 8.553982408174495e-06, + "loss": 2.6504, + "step": 7999500 + }, + { + "epoch": 2.486916487790017, + "grad_norm": 10.390095710754395, + "learning_rate": 8.55139187016638e-06, + "loss": 2.6715, + "step": 8000000 + }, + { + "epoch": 2.487071920070504, + "grad_norm": 8.396154403686523, + "learning_rate": 8.548801332158266e-06, + "loss": 2.5737, + "step": 8000500 + }, + { + "epoch": 2.487227352350991, + "grad_norm": 11.365593910217285, + "learning_rate": 8.546210794150151e-06, + "loss": 2.6463, + "step": 8001000 + }, + { + "epoch": 2.4873827846314778, + "grad_norm": 7.34208869934082, + "learning_rate": 8.543620256142037e-06, + "loss": 2.6621, + "step": 8001500 + }, + { + "epoch": 2.4875382169119646, + "grad_norm": 11.256928443908691, + "learning_rate": 8.541029718133922e-06, + "loss": 2.5872, + "step": 8002000 + }, + { + "epoch": 2.4876936491924515, + "grad_norm": 10.319540023803711, + "learning_rate": 8.538439180125806e-06, + "loss": 2.5993, + "step": 8002500 + }, + { + "epoch": 2.4878490814729384, + "grad_norm": 11.648587226867676, + "learning_rate": 8.535848642117693e-06, + "loss": 2.5759, + "step": 8003000 + }, + { + "epoch": 2.4880045137534252, + "grad_norm": 9.849802017211914, + "learning_rate": 8.533258104109577e-06, + "loss": 2.6247, + "step": 8003500 + }, + { + "epoch": 2.488159946033912, + "grad_norm": 9.350398063659668, + "learning_rate": 8.530667566101464e-06, + "loss": 2.6276, + "step": 8004000 + }, + { + "epoch": 2.488315378314399, + "grad_norm": 28.33826446533203, + "learning_rate": 8.52807702809335e-06, + "loss": 2.6154, + "step": 8004500 + }, + { + "epoch": 2.488470810594886, + "grad_norm": 9.060592651367188, + "learning_rate": 8.525486490085233e-06, + "loss": 2.6006, + "step": 8005000 + }, + { + "epoch": 2.4886262428753727, + "grad_norm": 10.287755012512207, + "learning_rate": 8.52289595207712e-06, + "loss": 2.6091, + "step": 8005500 + }, + { + "epoch": 2.4887816751558596, + "grad_norm": 11.334108352661133, + "learning_rate": 8.520305414069004e-06, + "loss": 2.6274, + "step": 8006000 + }, + { + "epoch": 2.4889371074363464, + "grad_norm": 10.111272811889648, + "learning_rate": 8.517714876060891e-06, + "loss": 2.6431, + "step": 8006500 + }, + { + "epoch": 2.4890925397168333, + "grad_norm": 8.94058895111084, + "learning_rate": 8.515124338052775e-06, + "loss": 2.5974, + "step": 8007000 + }, + { + "epoch": 2.48924797199732, + "grad_norm": 9.771383285522461, + "learning_rate": 8.51253380004466e-06, + "loss": 2.6235, + "step": 8007500 + }, + { + "epoch": 2.489403404277807, + "grad_norm": 11.005818367004395, + "learning_rate": 8.509943262036546e-06, + "loss": 2.5957, + "step": 8008000 + }, + { + "epoch": 2.489558836558294, + "grad_norm": 10.65101146697998, + "learning_rate": 8.507352724028432e-06, + "loss": 2.5939, + "step": 8008500 + }, + { + "epoch": 2.4897142688387808, + "grad_norm": 11.283763885498047, + "learning_rate": 8.504762186020319e-06, + "loss": 2.6536, + "step": 8009000 + }, + { + "epoch": 2.489869701119268, + "grad_norm": 11.439658164978027, + "learning_rate": 8.502171648012203e-06, + "loss": 2.6044, + "step": 8009500 + }, + { + "epoch": 2.4900251333997545, + "grad_norm": 11.222257614135742, + "learning_rate": 8.499581110004088e-06, + "loss": 2.6151, + "step": 8010000 + }, + { + "epoch": 2.490180565680242, + "grad_norm": 10.153009414672852, + "learning_rate": 8.496990571995973e-06, + "loss": 2.6059, + "step": 8010500 + }, + { + "epoch": 2.4903359979607282, + "grad_norm": 11.722087860107422, + "learning_rate": 8.494400033987859e-06, + "loss": 2.6363, + "step": 8011000 + }, + { + "epoch": 2.4904914302412156, + "grad_norm": 24.47235870361328, + "learning_rate": 8.491809495979744e-06, + "loss": 2.6506, + "step": 8011500 + }, + { + "epoch": 2.4906468625217024, + "grad_norm": 19.015478134155273, + "learning_rate": 8.48921895797163e-06, + "loss": 2.5934, + "step": 8012000 + }, + { + "epoch": 2.4908022948021893, + "grad_norm": 9.034960746765137, + "learning_rate": 8.486628419963515e-06, + "loss": 2.6182, + "step": 8012500 + }, + { + "epoch": 2.490957727082676, + "grad_norm": 12.8160400390625, + "learning_rate": 8.4840378819554e-06, + "loss": 2.5927, + "step": 8013000 + }, + { + "epoch": 2.491113159363163, + "grad_norm": 10.299571990966797, + "learning_rate": 8.481447343947286e-06, + "loss": 2.617, + "step": 8013500 + }, + { + "epoch": 2.49126859164365, + "grad_norm": 9.880298614501953, + "learning_rate": 8.478856805939172e-06, + "loss": 2.6153, + "step": 8014000 + }, + { + "epoch": 2.4914240239241368, + "grad_norm": 13.162077903747559, + "learning_rate": 8.476266267931057e-06, + "loss": 2.5873, + "step": 8014500 + }, + { + "epoch": 2.4915794562046236, + "grad_norm": 9.474067687988281, + "learning_rate": 8.473675729922943e-06, + "loss": 2.597, + "step": 8015000 + }, + { + "epoch": 2.4917348884851105, + "grad_norm": 9.34693431854248, + "learning_rate": 8.471085191914828e-06, + "loss": 2.6586, + "step": 8015500 + }, + { + "epoch": 2.4918903207655974, + "grad_norm": 9.008691787719727, + "learning_rate": 8.468494653906712e-06, + "loss": 2.6075, + "step": 8016000 + }, + { + "epoch": 2.4920457530460842, + "grad_norm": 8.927879333496094, + "learning_rate": 8.465904115898599e-06, + "loss": 2.5847, + "step": 8016500 + }, + { + "epoch": 2.492201185326571, + "grad_norm": 8.883443832397461, + "learning_rate": 8.463313577890483e-06, + "loss": 2.6, + "step": 8017000 + }, + { + "epoch": 2.492356617607058, + "grad_norm": 11.619099617004395, + "learning_rate": 8.46072303988237e-06, + "loss": 2.6283, + "step": 8017500 + }, + { + "epoch": 2.492512049887545, + "grad_norm": 12.230480194091797, + "learning_rate": 8.458132501874255e-06, + "loss": 2.5659, + "step": 8018000 + }, + { + "epoch": 2.4926674821680317, + "grad_norm": 10.77840518951416, + "learning_rate": 8.455541963866139e-06, + "loss": 2.6043, + "step": 8018500 + }, + { + "epoch": 2.4928229144485186, + "grad_norm": 25.80297088623047, + "learning_rate": 8.452951425858026e-06, + "loss": 2.604, + "step": 8019000 + }, + { + "epoch": 2.4929783467290054, + "grad_norm": 43.108726501464844, + "learning_rate": 8.45036088784991e-06, + "loss": 2.5906, + "step": 8019500 + }, + { + "epoch": 2.4931337790094923, + "grad_norm": 13.588606834411621, + "learning_rate": 8.447770349841797e-06, + "loss": 2.6022, + "step": 8020000 + }, + { + "epoch": 2.493289211289979, + "grad_norm": 9.702062606811523, + "learning_rate": 8.445179811833681e-06, + "loss": 2.6224, + "step": 8020500 + }, + { + "epoch": 2.493444643570466, + "grad_norm": 10.255958557128906, + "learning_rate": 8.442589273825566e-06, + "loss": 2.5899, + "step": 8021000 + }, + { + "epoch": 2.493600075850953, + "grad_norm": 10.101166725158691, + "learning_rate": 8.439998735817452e-06, + "loss": 2.5996, + "step": 8021500 + }, + { + "epoch": 2.4937555081314398, + "grad_norm": 8.47763442993164, + "learning_rate": 8.437408197809337e-06, + "loss": 2.6357, + "step": 8022000 + }, + { + "epoch": 2.4939109404119266, + "grad_norm": 10.476099967956543, + "learning_rate": 8.434817659801224e-06, + "loss": 2.6666, + "step": 8022500 + }, + { + "epoch": 2.4940663726924135, + "grad_norm": 9.08853816986084, + "learning_rate": 8.432227121793108e-06, + "loss": 2.6298, + "step": 8023000 + }, + { + "epoch": 2.4942218049729004, + "grad_norm": 13.266302108764648, + "learning_rate": 8.429636583784994e-06, + "loss": 2.6539, + "step": 8023500 + }, + { + "epoch": 2.4943772372533872, + "grad_norm": 10.168411254882812, + "learning_rate": 8.427046045776879e-06, + "loss": 2.6222, + "step": 8024000 + }, + { + "epoch": 2.494532669533874, + "grad_norm": 8.733831405639648, + "learning_rate": 8.424455507768765e-06, + "loss": 2.6016, + "step": 8024500 + }, + { + "epoch": 2.494688101814361, + "grad_norm": 9.381916999816895, + "learning_rate": 8.42186496976065e-06, + "loss": 2.6529, + "step": 8025000 + }, + { + "epoch": 2.494843534094848, + "grad_norm": 10.40811824798584, + "learning_rate": 8.419274431752536e-06, + "loss": 2.5624, + "step": 8025500 + }, + { + "epoch": 2.4949989663753347, + "grad_norm": 15.632596015930176, + "learning_rate": 8.416683893744421e-06, + "loss": 2.6241, + "step": 8026000 + }, + { + "epoch": 2.4951543986558216, + "grad_norm": 10.647933959960938, + "learning_rate": 8.414093355736306e-06, + "loss": 2.6179, + "step": 8026500 + }, + { + "epoch": 2.4953098309363084, + "grad_norm": 9.95867919921875, + "learning_rate": 8.411502817728192e-06, + "loss": 2.6177, + "step": 8027000 + }, + { + "epoch": 2.4954652632167953, + "grad_norm": 13.763324737548828, + "learning_rate": 8.408912279720077e-06, + "loss": 2.5896, + "step": 8027500 + }, + { + "epoch": 2.495620695497282, + "grad_norm": 10.413973808288574, + "learning_rate": 8.406321741711963e-06, + "loss": 2.6033, + "step": 8028000 + }, + { + "epoch": 2.495776127777769, + "grad_norm": 12.056183815002441, + "learning_rate": 8.403731203703848e-06, + "loss": 2.633, + "step": 8028500 + }, + { + "epoch": 2.495931560058256, + "grad_norm": 9.414812088012695, + "learning_rate": 8.401140665695734e-06, + "loss": 2.6215, + "step": 8029000 + }, + { + "epoch": 2.4960869923387428, + "grad_norm": 11.68908977508545, + "learning_rate": 8.398550127687618e-06, + "loss": 2.6466, + "step": 8029500 + }, + { + "epoch": 2.4962424246192296, + "grad_norm": 9.674376487731934, + "learning_rate": 8.395959589679505e-06, + "loss": 2.6072, + "step": 8030000 + }, + { + "epoch": 2.4963978568997165, + "grad_norm": 11.176316261291504, + "learning_rate": 8.393369051671388e-06, + "loss": 2.5784, + "step": 8030500 + }, + { + "epoch": 2.4965532891802034, + "grad_norm": 10.333516120910645, + "learning_rate": 8.390778513663276e-06, + "loss": 2.611, + "step": 8031000 + }, + { + "epoch": 2.4967087214606902, + "grad_norm": 14.101252555847168, + "learning_rate": 8.388187975655161e-06, + "loss": 2.6054, + "step": 8031500 + }, + { + "epoch": 2.496864153741177, + "grad_norm": 9.254341125488281, + "learning_rate": 8.385597437647045e-06, + "loss": 2.6084, + "step": 8032000 + }, + { + "epoch": 2.497019586021664, + "grad_norm": 12.138935089111328, + "learning_rate": 8.383006899638932e-06, + "loss": 2.6282, + "step": 8032500 + }, + { + "epoch": 2.4971750183021513, + "grad_norm": 29.432287216186523, + "learning_rate": 8.380416361630816e-06, + "loss": 2.5554, + "step": 8033000 + }, + { + "epoch": 2.4973304505826377, + "grad_norm": 12.66566276550293, + "learning_rate": 8.377825823622703e-06, + "loss": 2.6161, + "step": 8033500 + }, + { + "epoch": 2.497485882863125, + "grad_norm": 9.740914344787598, + "learning_rate": 8.375235285614587e-06, + "loss": 2.6674, + "step": 8034000 + }, + { + "epoch": 2.4976413151436114, + "grad_norm": 13.948325157165527, + "learning_rate": 8.372644747606472e-06, + "loss": 2.6071, + "step": 8034500 + }, + { + "epoch": 2.4977967474240987, + "grad_norm": 8.790169715881348, + "learning_rate": 8.37005420959836e-06, + "loss": 2.6171, + "step": 8035000 + }, + { + "epoch": 2.497952179704585, + "grad_norm": 10.106468200683594, + "learning_rate": 8.367463671590243e-06, + "loss": 2.6061, + "step": 8035500 + }, + { + "epoch": 2.4981076119850725, + "grad_norm": 10.972124099731445, + "learning_rate": 8.36487313358213e-06, + "loss": 2.5937, + "step": 8036000 + }, + { + "epoch": 2.4982630442655593, + "grad_norm": 8.791133880615234, + "learning_rate": 8.362282595574014e-06, + "loss": 2.5947, + "step": 8036500 + }, + { + "epoch": 2.498418476546046, + "grad_norm": 12.869220733642578, + "learning_rate": 8.3596920575659e-06, + "loss": 2.6409, + "step": 8037000 + }, + { + "epoch": 2.498573908826533, + "grad_norm": 9.772428512573242, + "learning_rate": 8.357101519557785e-06, + "loss": 2.5872, + "step": 8037500 + }, + { + "epoch": 2.49872934110702, + "grad_norm": 11.679733276367188, + "learning_rate": 8.35451098154967e-06, + "loss": 2.6026, + "step": 8038000 + }, + { + "epoch": 2.498884773387507, + "grad_norm": 9.889081001281738, + "learning_rate": 8.351920443541556e-06, + "loss": 2.6172, + "step": 8038500 + }, + { + "epoch": 2.4990402056679937, + "grad_norm": 10.656824111938477, + "learning_rate": 8.349329905533441e-06, + "loss": 2.6551, + "step": 8039000 + }, + { + "epoch": 2.4991956379484805, + "grad_norm": 10.542074203491211, + "learning_rate": 8.346739367525327e-06, + "loss": 2.5918, + "step": 8039500 + }, + { + "epoch": 2.4993510702289674, + "grad_norm": 8.281981468200684, + "learning_rate": 8.344148829517212e-06, + "loss": 2.609, + "step": 8040000 + }, + { + "epoch": 2.4995065025094543, + "grad_norm": 14.636100769042969, + "learning_rate": 8.341558291509098e-06, + "loss": 2.6092, + "step": 8040500 + }, + { + "epoch": 2.499661934789941, + "grad_norm": 10.443163871765137, + "learning_rate": 8.338967753500983e-06, + "loss": 2.6041, + "step": 8041000 + }, + { + "epoch": 2.499817367070428, + "grad_norm": 10.192425727844238, + "learning_rate": 8.336377215492869e-06, + "loss": 2.6621, + "step": 8041500 + }, + { + "epoch": 2.499972799350915, + "grad_norm": 11.369735717773438, + "learning_rate": 8.333786677484754e-06, + "loss": 2.6213, + "step": 8042000 + }, + { + "epoch": 2.5001282316314017, + "grad_norm": 8.232900619506836, + "learning_rate": 8.33119613947664e-06, + "loss": 2.6231, + "step": 8042500 + }, + { + "epoch": 2.5002836639118886, + "grad_norm": 9.207747459411621, + "learning_rate": 8.328605601468523e-06, + "loss": 2.6408, + "step": 8043000 + }, + { + "epoch": 2.5004390961923755, + "grad_norm": 7.878885269165039, + "learning_rate": 8.32601506346041e-06, + "loss": 2.6338, + "step": 8043500 + }, + { + "epoch": 2.5005945284728623, + "grad_norm": 11.422514915466309, + "learning_rate": 8.323424525452296e-06, + "loss": 2.6512, + "step": 8044000 + }, + { + "epoch": 2.500749960753349, + "grad_norm": 7.595954895019531, + "learning_rate": 8.320833987444181e-06, + "loss": 2.5445, + "step": 8044500 + }, + { + "epoch": 2.500905393033836, + "grad_norm": 9.307751655578613, + "learning_rate": 8.318243449436067e-06, + "loss": 2.6211, + "step": 8045000 + }, + { + "epoch": 2.501060825314323, + "grad_norm": 11.512835502624512, + "learning_rate": 8.31565291142795e-06, + "loss": 2.6037, + "step": 8045500 + }, + { + "epoch": 2.50121625759481, + "grad_norm": 8.3787260055542, + "learning_rate": 8.313062373419838e-06, + "loss": 2.5929, + "step": 8046000 + }, + { + "epoch": 2.5013716898752967, + "grad_norm": 10.157052993774414, + "learning_rate": 8.310471835411721e-06, + "loss": 2.5869, + "step": 8046500 + }, + { + "epoch": 2.5015271221557835, + "grad_norm": 10.006987571716309, + "learning_rate": 8.307881297403609e-06, + "loss": 2.5871, + "step": 8047000 + }, + { + "epoch": 2.5016825544362704, + "grad_norm": 10.573816299438477, + "learning_rate": 8.305290759395492e-06, + "loss": 2.6118, + "step": 8047500 + }, + { + "epoch": 2.5018379867167573, + "grad_norm": 10.159104347229004, + "learning_rate": 8.302700221387378e-06, + "loss": 2.6025, + "step": 8048000 + }, + { + "epoch": 2.501993418997244, + "grad_norm": 19.17862892150879, + "learning_rate": 8.300109683379265e-06, + "loss": 2.6093, + "step": 8048500 + }, + { + "epoch": 2.502148851277731, + "grad_norm": 9.658622741699219, + "learning_rate": 8.297519145371149e-06, + "loss": 2.569, + "step": 8049000 + }, + { + "epoch": 2.502304283558218, + "grad_norm": 10.076430320739746, + "learning_rate": 8.294928607363036e-06, + "loss": 2.6348, + "step": 8049500 + }, + { + "epoch": 2.5024597158387047, + "grad_norm": 10.436223030090332, + "learning_rate": 8.29233806935492e-06, + "loss": 2.6085, + "step": 8050000 + }, + { + "epoch": 2.5026151481191916, + "grad_norm": 8.937577247619629, + "learning_rate": 8.289747531346805e-06, + "loss": 2.6506, + "step": 8050500 + }, + { + "epoch": 2.5027705803996785, + "grad_norm": 10.456930160522461, + "learning_rate": 8.28715699333869e-06, + "loss": 2.6025, + "step": 8051000 + }, + { + "epoch": 2.5029260126801653, + "grad_norm": 11.68314266204834, + "learning_rate": 8.284566455330576e-06, + "loss": 2.6407, + "step": 8051500 + }, + { + "epoch": 2.503081444960652, + "grad_norm": 9.738984107971191, + "learning_rate": 8.281975917322461e-06, + "loss": 2.6044, + "step": 8052000 + }, + { + "epoch": 2.503236877241139, + "grad_norm": 11.151161193847656, + "learning_rate": 8.279385379314347e-06, + "loss": 2.5559, + "step": 8052500 + }, + { + "epoch": 2.503392309521626, + "grad_norm": 16.21845245361328, + "learning_rate": 8.276794841306232e-06, + "loss": 2.6517, + "step": 8053000 + }, + { + "epoch": 2.503547741802113, + "grad_norm": 10.972716331481934, + "learning_rate": 8.274204303298118e-06, + "loss": 2.6071, + "step": 8053500 + }, + { + "epoch": 2.5037031740825997, + "grad_norm": 17.6455135345459, + "learning_rate": 8.271613765290003e-06, + "loss": 2.6593, + "step": 8054000 + }, + { + "epoch": 2.5038586063630865, + "grad_norm": 9.512933731079102, + "learning_rate": 8.269023227281889e-06, + "loss": 2.6087, + "step": 8054500 + }, + { + "epoch": 2.5040140386435734, + "grad_norm": 9.5897855758667, + "learning_rate": 8.266432689273774e-06, + "loss": 2.6042, + "step": 8055000 + }, + { + "epoch": 2.5041694709240607, + "grad_norm": 9.482084274291992, + "learning_rate": 8.26384215126566e-06, + "loss": 2.5619, + "step": 8055500 + }, + { + "epoch": 2.504324903204547, + "grad_norm": 14.693626403808594, + "learning_rate": 8.261251613257545e-06, + "loss": 2.6218, + "step": 8056000 + }, + { + "epoch": 2.5044803354850345, + "grad_norm": 8.655763626098633, + "learning_rate": 8.25866107524943e-06, + "loss": 2.6648, + "step": 8056500 + }, + { + "epoch": 2.504635767765521, + "grad_norm": 9.211015701293945, + "learning_rate": 8.256070537241316e-06, + "loss": 2.5828, + "step": 8057000 + }, + { + "epoch": 2.504791200046008, + "grad_norm": 9.736204147338867, + "learning_rate": 8.253479999233202e-06, + "loss": 2.6411, + "step": 8057500 + }, + { + "epoch": 2.5049466323264946, + "grad_norm": 11.591126441955566, + "learning_rate": 8.250889461225087e-06, + "loss": 2.6282, + "step": 8058000 + }, + { + "epoch": 2.505102064606982, + "grad_norm": 9.500598907470703, + "learning_rate": 8.248298923216972e-06, + "loss": 2.5941, + "step": 8058500 + }, + { + "epoch": 2.5052574968874683, + "grad_norm": 9.39250373840332, + "learning_rate": 8.245708385208858e-06, + "loss": 2.6074, + "step": 8059000 + }, + { + "epoch": 2.5054129291679557, + "grad_norm": 9.680051803588867, + "learning_rate": 8.243117847200743e-06, + "loss": 2.5939, + "step": 8059500 + }, + { + "epoch": 2.505568361448442, + "grad_norm": 10.36780071258545, + "learning_rate": 8.240527309192627e-06, + "loss": 2.5782, + "step": 8060000 + }, + { + "epoch": 2.5057237937289294, + "grad_norm": 31.79720115661621, + "learning_rate": 8.237936771184514e-06, + "loss": 2.5608, + "step": 8060500 + }, + { + "epoch": 2.505879226009416, + "grad_norm": 9.418802261352539, + "learning_rate": 8.235346233176398e-06, + "loss": 2.6282, + "step": 8061000 + }, + { + "epoch": 2.506034658289903, + "grad_norm": 17.96009635925293, + "learning_rate": 8.232755695168285e-06, + "loss": 2.5792, + "step": 8061500 + }, + { + "epoch": 2.50619009057039, + "grad_norm": 11.40771770477295, + "learning_rate": 8.23016515716017e-06, + "loss": 2.6129, + "step": 8062000 + }, + { + "epoch": 2.506345522850877, + "grad_norm": 24.897077560424805, + "learning_rate": 8.227574619152054e-06, + "loss": 2.5922, + "step": 8062500 + }, + { + "epoch": 2.5065009551313637, + "grad_norm": 9.286513328552246, + "learning_rate": 8.224984081143942e-06, + "loss": 2.6074, + "step": 8063000 + }, + { + "epoch": 2.5066563874118506, + "grad_norm": 11.437042236328125, + "learning_rate": 8.222393543135825e-06, + "loss": 2.5677, + "step": 8063500 + }, + { + "epoch": 2.5068118196923375, + "grad_norm": 10.826619148254395, + "learning_rate": 8.219803005127713e-06, + "loss": 2.6011, + "step": 8064000 + }, + { + "epoch": 2.5069672519728243, + "grad_norm": 9.33775806427002, + "learning_rate": 8.217212467119596e-06, + "loss": 2.6118, + "step": 8064500 + }, + { + "epoch": 2.507122684253311, + "grad_norm": 27.31100082397461, + "learning_rate": 8.214621929111482e-06, + "loss": 2.5604, + "step": 8065000 + }, + { + "epoch": 2.507278116533798, + "grad_norm": 10.147936820983887, + "learning_rate": 8.212031391103367e-06, + "loss": 2.5796, + "step": 8065500 + }, + { + "epoch": 2.507433548814285, + "grad_norm": 11.151575088500977, + "learning_rate": 8.209440853095253e-06, + "loss": 2.5596, + "step": 8066000 + }, + { + "epoch": 2.507588981094772, + "grad_norm": 8.865436553955078, + "learning_rate": 8.20685031508714e-06, + "loss": 2.6006, + "step": 8066500 + }, + { + "epoch": 2.5077444133752587, + "grad_norm": 9.60867977142334, + "learning_rate": 8.204259777079024e-06, + "loss": 2.5924, + "step": 8067000 + }, + { + "epoch": 2.5078998456557455, + "grad_norm": 8.318349838256836, + "learning_rate": 8.201669239070909e-06, + "loss": 2.623, + "step": 8067500 + }, + { + "epoch": 2.5080552779362324, + "grad_norm": 13.405839920043945, + "learning_rate": 8.199078701062794e-06, + "loss": 2.6178, + "step": 8068000 + }, + { + "epoch": 2.5082107102167193, + "grad_norm": 9.784021377563477, + "learning_rate": 8.19648816305468e-06, + "loss": 2.6313, + "step": 8068500 + }, + { + "epoch": 2.508366142497206, + "grad_norm": 10.501080513000488, + "learning_rate": 8.193897625046565e-06, + "loss": 2.6481, + "step": 8069000 + }, + { + "epoch": 2.508521574777693, + "grad_norm": 7.723010540008545, + "learning_rate": 8.191307087038451e-06, + "loss": 2.6096, + "step": 8069500 + }, + { + "epoch": 2.50867700705818, + "grad_norm": 9.54172134399414, + "learning_rate": 8.188716549030336e-06, + "loss": 2.6018, + "step": 8070000 + }, + { + "epoch": 2.5088324393386667, + "grad_norm": 8.856313705444336, + "learning_rate": 8.186126011022222e-06, + "loss": 2.6653, + "step": 8070500 + }, + { + "epoch": 2.5089878716191536, + "grad_norm": 9.624457359313965, + "learning_rate": 8.183535473014107e-06, + "loss": 2.6596, + "step": 8071000 + }, + { + "epoch": 2.5091433038996405, + "grad_norm": 18.468488693237305, + "learning_rate": 8.180944935005993e-06, + "loss": 2.6179, + "step": 8071500 + }, + { + "epoch": 2.5092987361801273, + "grad_norm": 8.926183700561523, + "learning_rate": 8.178354396997878e-06, + "loss": 2.6159, + "step": 8072000 + }, + { + "epoch": 2.509454168460614, + "grad_norm": 6.6826605796813965, + "learning_rate": 8.175763858989764e-06, + "loss": 2.6016, + "step": 8072500 + }, + { + "epoch": 2.509609600741101, + "grad_norm": 14.060791969299316, + "learning_rate": 8.173173320981649e-06, + "loss": 2.5848, + "step": 8073000 + }, + { + "epoch": 2.509765033021588, + "grad_norm": 9.725632667541504, + "learning_rate": 8.170582782973533e-06, + "loss": 2.5855, + "step": 8073500 + }, + { + "epoch": 2.509920465302075, + "grad_norm": 7.7008137702941895, + "learning_rate": 8.16799224496542e-06, + "loss": 2.6003, + "step": 8074000 + }, + { + "epoch": 2.5100758975825617, + "grad_norm": 28.158390045166016, + "learning_rate": 8.165401706957304e-06, + "loss": 2.5827, + "step": 8074500 + }, + { + "epoch": 2.5102313298630485, + "grad_norm": 11.740840911865234, + "learning_rate": 8.162811168949191e-06, + "loss": 2.5625, + "step": 8075000 + }, + { + "epoch": 2.5103867621435354, + "grad_norm": 9.565690994262695, + "learning_rate": 8.160220630941076e-06, + "loss": 2.6152, + "step": 8075500 + }, + { + "epoch": 2.5105421944240223, + "grad_norm": 9.07949161529541, + "learning_rate": 8.15763009293296e-06, + "loss": 2.6312, + "step": 8076000 + }, + { + "epoch": 2.510697626704509, + "grad_norm": 8.923215866088867, + "learning_rate": 8.155039554924847e-06, + "loss": 2.632, + "step": 8076500 + }, + { + "epoch": 2.510853058984996, + "grad_norm": 10.705187797546387, + "learning_rate": 8.152449016916731e-06, + "loss": 2.6084, + "step": 8077000 + }, + { + "epoch": 2.511008491265483, + "grad_norm": 11.079221725463867, + "learning_rate": 8.149858478908618e-06, + "loss": 2.612, + "step": 8077500 + }, + { + "epoch": 2.5111639235459697, + "grad_norm": 9.411417007446289, + "learning_rate": 8.147267940900502e-06, + "loss": 2.6166, + "step": 8078000 + }, + { + "epoch": 2.5113193558264566, + "grad_norm": 8.743462562561035, + "learning_rate": 8.144677402892387e-06, + "loss": 2.5945, + "step": 8078500 + }, + { + "epoch": 2.5114747881069435, + "grad_norm": 13.62913990020752, + "learning_rate": 8.142086864884273e-06, + "loss": 2.6218, + "step": 8079000 + }, + { + "epoch": 2.5116302203874303, + "grad_norm": 13.229246139526367, + "learning_rate": 8.139496326876158e-06, + "loss": 2.6232, + "step": 8079500 + }, + { + "epoch": 2.5117856526679176, + "grad_norm": 8.581465721130371, + "learning_rate": 8.136905788868046e-06, + "loss": 2.6221, + "step": 8080000 + }, + { + "epoch": 2.511941084948404, + "grad_norm": 9.626025199890137, + "learning_rate": 8.13431525085993e-06, + "loss": 2.6, + "step": 8080500 + }, + { + "epoch": 2.5120965172288914, + "grad_norm": 10.698192596435547, + "learning_rate": 8.131724712851815e-06, + "loss": 2.6061, + "step": 8081000 + }, + { + "epoch": 2.512251949509378, + "grad_norm": 10.155649185180664, + "learning_rate": 8.1291341748437e-06, + "loss": 2.6113, + "step": 8081500 + }, + { + "epoch": 2.512407381789865, + "grad_norm": 9.371596336364746, + "learning_rate": 8.126543636835586e-06, + "loss": 2.6051, + "step": 8082000 + }, + { + "epoch": 2.5125628140703515, + "grad_norm": 26.29619789123535, + "learning_rate": 8.123953098827471e-06, + "loss": 2.6397, + "step": 8082500 + }, + { + "epoch": 2.512718246350839, + "grad_norm": 10.375480651855469, + "learning_rate": 8.121362560819357e-06, + "loss": 2.6536, + "step": 8083000 + }, + { + "epoch": 2.5128736786313253, + "grad_norm": 5.3334760665893555, + "learning_rate": 8.118772022811242e-06, + "loss": 2.5938, + "step": 8083500 + }, + { + "epoch": 2.5130291109118126, + "grad_norm": 10.086828231811523, + "learning_rate": 8.116181484803127e-06, + "loss": 2.5813, + "step": 8084000 + }, + { + "epoch": 2.513184543192299, + "grad_norm": 9.449143409729004, + "learning_rate": 8.113590946795013e-06, + "loss": 2.6101, + "step": 8084500 + }, + { + "epoch": 2.5133399754727863, + "grad_norm": 14.196595191955566, + "learning_rate": 8.111000408786898e-06, + "loss": 2.5759, + "step": 8085000 + }, + { + "epoch": 2.513495407753273, + "grad_norm": 10.036832809448242, + "learning_rate": 8.108409870778784e-06, + "loss": 2.5596, + "step": 8085500 + }, + { + "epoch": 2.51365084003376, + "grad_norm": 8.155187606811523, + "learning_rate": 8.10581933277067e-06, + "loss": 2.5809, + "step": 8086000 + }, + { + "epoch": 2.513806272314247, + "grad_norm": 10.062775611877441, + "learning_rate": 8.103228794762555e-06, + "loss": 2.5663, + "step": 8086500 + }, + { + "epoch": 2.513961704594734, + "grad_norm": 11.201189994812012, + "learning_rate": 8.100638256754439e-06, + "loss": 2.6136, + "step": 8087000 + }, + { + "epoch": 2.5141171368752206, + "grad_norm": 9.833619117736816, + "learning_rate": 8.098047718746326e-06, + "loss": 2.5912, + "step": 8087500 + }, + { + "epoch": 2.5142725691557075, + "grad_norm": 9.0213041305542, + "learning_rate": 8.09545718073821e-06, + "loss": 2.6587, + "step": 8088000 + }, + { + "epoch": 2.5144280014361944, + "grad_norm": 11.406463623046875, + "learning_rate": 8.092866642730097e-06, + "loss": 2.5836, + "step": 8088500 + }, + { + "epoch": 2.5145834337166812, + "grad_norm": 11.372011184692383, + "learning_rate": 8.090276104721982e-06, + "loss": 2.6016, + "step": 8089000 + }, + { + "epoch": 2.514738865997168, + "grad_norm": 12.18859577178955, + "learning_rate": 8.087685566713866e-06, + "loss": 2.64, + "step": 8089500 + }, + { + "epoch": 2.514894298277655, + "grad_norm": 10.16530990600586, + "learning_rate": 8.085095028705753e-06, + "loss": 2.6186, + "step": 8090000 + }, + { + "epoch": 2.515049730558142, + "grad_norm": 10.543021202087402, + "learning_rate": 8.082504490697637e-06, + "loss": 2.6033, + "step": 8090500 + }, + { + "epoch": 2.5152051628386287, + "grad_norm": 11.013890266418457, + "learning_rate": 8.079913952689524e-06, + "loss": 2.5892, + "step": 8091000 + }, + { + "epoch": 2.5153605951191156, + "grad_norm": 13.896410942077637, + "learning_rate": 8.077323414681408e-06, + "loss": 2.6939, + "step": 8091500 + }, + { + "epoch": 2.5155160273996025, + "grad_norm": 15.14448356628418, + "learning_rate": 8.074732876673293e-06, + "loss": 2.6172, + "step": 8092000 + }, + { + "epoch": 2.5156714596800893, + "grad_norm": 10.58230209350586, + "learning_rate": 8.072142338665179e-06, + "loss": 2.6171, + "step": 8092500 + }, + { + "epoch": 2.515826891960576, + "grad_norm": 14.355035781860352, + "learning_rate": 8.069551800657064e-06, + "loss": 2.626, + "step": 8093000 + }, + { + "epoch": 2.515982324241063, + "grad_norm": 13.214204788208008, + "learning_rate": 8.066961262648951e-06, + "loss": 2.6499, + "step": 8093500 + }, + { + "epoch": 2.51613775652155, + "grad_norm": 11.198990821838379, + "learning_rate": 8.064370724640835e-06, + "loss": 2.605, + "step": 8094000 + }, + { + "epoch": 2.516293188802037, + "grad_norm": 10.949589729309082, + "learning_rate": 8.06178018663272e-06, + "loss": 2.5931, + "step": 8094500 + }, + { + "epoch": 2.5164486210825237, + "grad_norm": 8.587993621826172, + "learning_rate": 8.059189648624606e-06, + "loss": 2.6155, + "step": 8095000 + }, + { + "epoch": 2.5166040533630105, + "grad_norm": 12.905284881591797, + "learning_rate": 8.056599110616491e-06, + "loss": 2.6403, + "step": 8095500 + }, + { + "epoch": 2.5167594856434974, + "grad_norm": 10.04100227355957, + "learning_rate": 8.054008572608377e-06, + "loss": 2.6306, + "step": 8096000 + }, + { + "epoch": 2.5169149179239843, + "grad_norm": 8.54885196685791, + "learning_rate": 8.051418034600262e-06, + "loss": 2.6573, + "step": 8096500 + }, + { + "epoch": 2.517070350204471, + "grad_norm": 11.178159713745117, + "learning_rate": 8.048827496592148e-06, + "loss": 2.5898, + "step": 8097000 + }, + { + "epoch": 2.517225782484958, + "grad_norm": 10.008538246154785, + "learning_rate": 8.046236958584033e-06, + "loss": 2.6101, + "step": 8097500 + }, + { + "epoch": 2.517381214765445, + "grad_norm": 8.9795560836792, + "learning_rate": 8.043646420575919e-06, + "loss": 2.6079, + "step": 8098000 + }, + { + "epoch": 2.5175366470459317, + "grad_norm": 16.33077621459961, + "learning_rate": 8.041055882567804e-06, + "loss": 2.636, + "step": 8098500 + }, + { + "epoch": 2.5176920793264186, + "grad_norm": 9.595792770385742, + "learning_rate": 8.03846534455969e-06, + "loss": 2.6032, + "step": 8099000 + }, + { + "epoch": 2.5178475116069055, + "grad_norm": 14.335075378417969, + "learning_rate": 8.035874806551575e-06, + "loss": 2.5916, + "step": 8099500 + }, + { + "epoch": 2.5180029438873923, + "grad_norm": 9.946885108947754, + "learning_rate": 8.03328426854346e-06, + "loss": 2.5903, + "step": 8100000 + }, + { + "epoch": 2.518158376167879, + "grad_norm": 9.458600044250488, + "learning_rate": 8.030693730535344e-06, + "loss": 2.5959, + "step": 8100500 + }, + { + "epoch": 2.518313808448366, + "grad_norm": 10.90911865234375, + "learning_rate": 8.028103192527231e-06, + "loss": 2.6168, + "step": 8101000 + }, + { + "epoch": 2.518469240728853, + "grad_norm": 10.250944137573242, + "learning_rate": 8.025512654519115e-06, + "loss": 2.5663, + "step": 8101500 + }, + { + "epoch": 2.51862467300934, + "grad_norm": 19.75604248046875, + "learning_rate": 8.022922116511002e-06, + "loss": 2.66, + "step": 8102000 + }, + { + "epoch": 2.5187801052898267, + "grad_norm": 16.180082321166992, + "learning_rate": 8.020331578502888e-06, + "loss": 2.569, + "step": 8102500 + }, + { + "epoch": 2.5189355375703135, + "grad_norm": 9.915266036987305, + "learning_rate": 8.017741040494772e-06, + "loss": 2.6793, + "step": 8103000 + }, + { + "epoch": 2.5190909698508004, + "grad_norm": 10.44381332397461, + "learning_rate": 8.015150502486659e-06, + "loss": 2.5367, + "step": 8103500 + }, + { + "epoch": 2.5192464021312873, + "grad_norm": 14.943709373474121, + "learning_rate": 8.012559964478542e-06, + "loss": 2.6328, + "step": 8104000 + }, + { + "epoch": 2.5194018344117746, + "grad_norm": 9.35802936553955, + "learning_rate": 8.00996942647043e-06, + "loss": 2.6209, + "step": 8104500 + }, + { + "epoch": 2.519557266692261, + "grad_norm": 15.6898193359375, + "learning_rate": 8.007378888462313e-06, + "loss": 2.6315, + "step": 8105000 + }, + { + "epoch": 2.5197126989727483, + "grad_norm": 8.37740421295166, + "learning_rate": 8.004788350454199e-06, + "loss": 2.6177, + "step": 8105500 + }, + { + "epoch": 2.5198681312532347, + "grad_norm": 15.22624397277832, + "learning_rate": 8.002197812446084e-06, + "loss": 2.5789, + "step": 8106000 + }, + { + "epoch": 2.520023563533722, + "grad_norm": 6.602203845977783, + "learning_rate": 7.99960727443797e-06, + "loss": 2.5791, + "step": 8106500 + }, + { + "epoch": 2.5201789958142085, + "grad_norm": 13.88569164276123, + "learning_rate": 7.997016736429857e-06, + "loss": 2.5878, + "step": 8107000 + }, + { + "epoch": 2.5203344280946958, + "grad_norm": 15.286507606506348, + "learning_rate": 7.99442619842174e-06, + "loss": 2.6001, + "step": 8107500 + }, + { + "epoch": 2.520489860375182, + "grad_norm": 9.876897811889648, + "learning_rate": 7.991835660413626e-06, + "loss": 2.6005, + "step": 8108000 + }, + { + "epoch": 2.5206452926556695, + "grad_norm": 8.477696418762207, + "learning_rate": 7.989245122405512e-06, + "loss": 2.5913, + "step": 8108500 + }, + { + "epoch": 2.520800724936156, + "grad_norm": 9.298032760620117, + "learning_rate": 7.986654584397397e-06, + "loss": 2.5835, + "step": 8109000 + }, + { + "epoch": 2.5209561572166432, + "grad_norm": 9.663846969604492, + "learning_rate": 7.984064046389283e-06, + "loss": 2.5841, + "step": 8109500 + }, + { + "epoch": 2.52111158949713, + "grad_norm": 9.832345962524414, + "learning_rate": 7.981473508381168e-06, + "loss": 2.5548, + "step": 8110000 + }, + { + "epoch": 2.521267021777617, + "grad_norm": 15.65277099609375, + "learning_rate": 7.978882970373053e-06, + "loss": 2.6311, + "step": 8110500 + }, + { + "epoch": 2.521422454058104, + "grad_norm": 11.647826194763184, + "learning_rate": 7.976292432364939e-06, + "loss": 2.6089, + "step": 8111000 + }, + { + "epoch": 2.5215778863385907, + "grad_norm": 24.2958984375, + "learning_rate": 7.973701894356824e-06, + "loss": 2.6376, + "step": 8111500 + }, + { + "epoch": 2.5217333186190776, + "grad_norm": 9.87450122833252, + "learning_rate": 7.97111135634871e-06, + "loss": 2.5993, + "step": 8112000 + }, + { + "epoch": 2.5218887508995644, + "grad_norm": 10.92211627960205, + "learning_rate": 7.968520818340595e-06, + "loss": 2.6643, + "step": 8112500 + }, + { + "epoch": 2.5220441831800513, + "grad_norm": 9.370071411132812, + "learning_rate": 7.96593028033248e-06, + "loss": 2.5929, + "step": 8113000 + }, + { + "epoch": 2.522199615460538, + "grad_norm": 39.99399948120117, + "learning_rate": 7.963339742324366e-06, + "loss": 2.6715, + "step": 8113500 + }, + { + "epoch": 2.522355047741025, + "grad_norm": 11.49185848236084, + "learning_rate": 7.96074920431625e-06, + "loss": 2.6037, + "step": 8114000 + }, + { + "epoch": 2.522510480021512, + "grad_norm": 12.846305847167969, + "learning_rate": 7.958158666308137e-06, + "loss": 2.6095, + "step": 8114500 + }, + { + "epoch": 2.5226659123019988, + "grad_norm": 10.558555603027344, + "learning_rate": 7.955568128300021e-06, + "loss": 2.5989, + "step": 8115000 + }, + { + "epoch": 2.5228213445824856, + "grad_norm": 8.00857925415039, + "learning_rate": 7.952977590291908e-06, + "loss": 2.6099, + "step": 8115500 + }, + { + "epoch": 2.5229767768629725, + "grad_norm": 9.207260131835938, + "learning_rate": 7.950387052283793e-06, + "loss": 2.6036, + "step": 8116000 + }, + { + "epoch": 2.5231322091434594, + "grad_norm": 10.049873352050781, + "learning_rate": 7.947796514275677e-06, + "loss": 2.5872, + "step": 8116500 + }, + { + "epoch": 2.5232876414239462, + "grad_norm": 16.715492248535156, + "learning_rate": 7.945205976267564e-06, + "loss": 2.5559, + "step": 8117000 + }, + { + "epoch": 2.523443073704433, + "grad_norm": 10.492286682128906, + "learning_rate": 7.942615438259448e-06, + "loss": 2.5393, + "step": 8117500 + }, + { + "epoch": 2.52359850598492, + "grad_norm": 22.17732810974121, + "learning_rate": 7.940024900251335e-06, + "loss": 2.5822, + "step": 8118000 + }, + { + "epoch": 2.523753938265407, + "grad_norm": 12.427825927734375, + "learning_rate": 7.937434362243219e-06, + "loss": 2.5824, + "step": 8118500 + }, + { + "epoch": 2.5239093705458937, + "grad_norm": 38.69115447998047, + "learning_rate": 7.934843824235105e-06, + "loss": 2.6126, + "step": 8119000 + }, + { + "epoch": 2.5240648028263806, + "grad_norm": 25.841758728027344, + "learning_rate": 7.93225328622699e-06, + "loss": 2.6239, + "step": 8119500 + }, + { + "epoch": 2.5242202351068674, + "grad_norm": 11.973304748535156, + "learning_rate": 7.929662748218875e-06, + "loss": 2.6086, + "step": 8120000 + }, + { + "epoch": 2.5243756673873543, + "grad_norm": 34.82142639160156, + "learning_rate": 7.927072210210763e-06, + "loss": 2.568, + "step": 8120500 + }, + { + "epoch": 2.524531099667841, + "grad_norm": 10.002700805664062, + "learning_rate": 7.924481672202646e-06, + "loss": 2.641, + "step": 8121000 + }, + { + "epoch": 2.524686531948328, + "grad_norm": 12.13956356048584, + "learning_rate": 7.921891134194532e-06, + "loss": 2.613, + "step": 8121500 + }, + { + "epoch": 2.524841964228815, + "grad_norm": 12.188050270080566, + "learning_rate": 7.919300596186417e-06, + "loss": 2.5894, + "step": 8122000 + }, + { + "epoch": 2.5249973965093018, + "grad_norm": 9.653507232666016, + "learning_rate": 7.916710058178303e-06, + "loss": 2.604, + "step": 8122500 + }, + { + "epoch": 2.5251528287897886, + "grad_norm": 9.399173736572266, + "learning_rate": 7.914119520170188e-06, + "loss": 2.5902, + "step": 8123000 + }, + { + "epoch": 2.5253082610702755, + "grad_norm": 10.165308952331543, + "learning_rate": 7.911528982162074e-06, + "loss": 2.6411, + "step": 8123500 + }, + { + "epoch": 2.5254636933507624, + "grad_norm": 10.96336841583252, + "learning_rate": 7.908938444153959e-06, + "loss": 2.5663, + "step": 8124000 + }, + { + "epoch": 2.5256191256312492, + "grad_norm": 10.885936737060547, + "learning_rate": 7.906347906145845e-06, + "loss": 2.6191, + "step": 8124500 + }, + { + "epoch": 2.525774557911736, + "grad_norm": 15.011577606201172, + "learning_rate": 7.90375736813773e-06, + "loss": 2.6159, + "step": 8125000 + }, + { + "epoch": 2.525929990192223, + "grad_norm": 27.759418487548828, + "learning_rate": 7.901166830129616e-06, + "loss": 2.5776, + "step": 8125500 + }, + { + "epoch": 2.52608542247271, + "grad_norm": 11.619479179382324, + "learning_rate": 7.898576292121501e-06, + "loss": 2.5962, + "step": 8126000 + }, + { + "epoch": 2.5262408547531967, + "grad_norm": 7.5977702140808105, + "learning_rate": 7.895985754113386e-06, + "loss": 2.5976, + "step": 8126500 + }, + { + "epoch": 2.5263962870336836, + "grad_norm": 10.83071231842041, + "learning_rate": 7.893395216105272e-06, + "loss": 2.6007, + "step": 8127000 + }, + { + "epoch": 2.5265517193141704, + "grad_norm": 8.321049690246582, + "learning_rate": 7.890804678097156e-06, + "loss": 2.6255, + "step": 8127500 + }, + { + "epoch": 2.5267071515946578, + "grad_norm": 12.509539604187012, + "learning_rate": 7.888214140089043e-06, + "loss": 2.623, + "step": 8128000 + }, + { + "epoch": 2.526862583875144, + "grad_norm": 12.477174758911133, + "learning_rate": 7.885623602080927e-06, + "loss": 2.5846, + "step": 8128500 + }, + { + "epoch": 2.5270180161556315, + "grad_norm": 8.053336143493652, + "learning_rate": 7.883033064072814e-06, + "loss": 2.6544, + "step": 8129000 + }, + { + "epoch": 2.527173448436118, + "grad_norm": 9.021810531616211, + "learning_rate": 7.8804425260647e-06, + "loss": 2.5873, + "step": 8129500 + }, + { + "epoch": 2.527328880716605, + "grad_norm": 10.49769115447998, + "learning_rate": 7.877851988056583e-06, + "loss": 2.6289, + "step": 8130000 + }, + { + "epoch": 2.5274843129970916, + "grad_norm": 8.81283950805664, + "learning_rate": 7.87526145004847e-06, + "loss": 2.6382, + "step": 8130500 + }, + { + "epoch": 2.527639745277579, + "grad_norm": 6.471630573272705, + "learning_rate": 7.872670912040354e-06, + "loss": 2.6387, + "step": 8131000 + }, + { + "epoch": 2.5277951775580654, + "grad_norm": 9.701960563659668, + "learning_rate": 7.870080374032241e-06, + "loss": 2.6468, + "step": 8131500 + }, + { + "epoch": 2.5279506098385527, + "grad_norm": 7.280743598937988, + "learning_rate": 7.867489836024125e-06, + "loss": 2.5668, + "step": 8132000 + }, + { + "epoch": 2.528106042119039, + "grad_norm": 10.796882629394531, + "learning_rate": 7.86489929801601e-06, + "loss": 2.5799, + "step": 8132500 + }, + { + "epoch": 2.5282614743995264, + "grad_norm": 15.21937370300293, + "learning_rate": 7.862308760007896e-06, + "loss": 2.6357, + "step": 8133000 + }, + { + "epoch": 2.528416906680013, + "grad_norm": 10.06947135925293, + "learning_rate": 7.859718221999781e-06, + "loss": 2.6507, + "step": 8133500 + }, + { + "epoch": 2.5285723389605, + "grad_norm": 6.328605651855469, + "learning_rate": 7.857127683991668e-06, + "loss": 2.5513, + "step": 8134000 + }, + { + "epoch": 2.528727771240987, + "grad_norm": 11.278197288513184, + "learning_rate": 7.854537145983552e-06, + "loss": 2.5923, + "step": 8134500 + }, + { + "epoch": 2.528883203521474, + "grad_norm": 10.677260398864746, + "learning_rate": 7.851946607975438e-06, + "loss": 2.6138, + "step": 8135000 + }, + { + "epoch": 2.5290386358019608, + "grad_norm": 8.160922050476074, + "learning_rate": 7.849356069967323e-06, + "loss": 2.6295, + "step": 8135500 + }, + { + "epoch": 2.5291940680824476, + "grad_norm": 32.124122619628906, + "learning_rate": 7.846765531959208e-06, + "loss": 2.5705, + "step": 8136000 + }, + { + "epoch": 2.5293495003629345, + "grad_norm": 10.688969612121582, + "learning_rate": 7.844174993951094e-06, + "loss": 2.5944, + "step": 8136500 + }, + { + "epoch": 2.5295049326434214, + "grad_norm": 6.3884663581848145, + "learning_rate": 7.84158445594298e-06, + "loss": 2.6267, + "step": 8137000 + }, + { + "epoch": 2.5296603649239082, + "grad_norm": 10.392560005187988, + "learning_rate": 7.838993917934865e-06, + "loss": 2.6143, + "step": 8137500 + }, + { + "epoch": 2.529815797204395, + "grad_norm": 8.666156768798828, + "learning_rate": 7.83640337992675e-06, + "loss": 2.659, + "step": 8138000 + }, + { + "epoch": 2.529971229484882, + "grad_norm": 7.643087863922119, + "learning_rate": 7.833812841918636e-06, + "loss": 2.6283, + "step": 8138500 + }, + { + "epoch": 2.530126661765369, + "grad_norm": 9.250725746154785, + "learning_rate": 7.831222303910521e-06, + "loss": 2.5866, + "step": 8139000 + }, + { + "epoch": 2.5302820940458557, + "grad_norm": 53.40584945678711, + "learning_rate": 7.828631765902407e-06, + "loss": 2.6069, + "step": 8139500 + }, + { + "epoch": 2.5304375263263426, + "grad_norm": 7.790058612823486, + "learning_rate": 7.826041227894292e-06, + "loss": 2.6218, + "step": 8140000 + }, + { + "epoch": 2.5305929586068294, + "grad_norm": 9.079414367675781, + "learning_rate": 7.823450689886178e-06, + "loss": 2.5771, + "step": 8140500 + }, + { + "epoch": 2.5307483908873163, + "grad_norm": 7.925078392028809, + "learning_rate": 7.820860151878061e-06, + "loss": 2.5864, + "step": 8141000 + }, + { + "epoch": 2.530903823167803, + "grad_norm": 9.30280876159668, + "learning_rate": 7.818269613869949e-06, + "loss": 2.5777, + "step": 8141500 + }, + { + "epoch": 2.53105925544829, + "grad_norm": 10.231706619262695, + "learning_rate": 7.815679075861832e-06, + "loss": 2.585, + "step": 8142000 + }, + { + "epoch": 2.531214687728777, + "grad_norm": 14.969414710998535, + "learning_rate": 7.81308853785372e-06, + "loss": 2.6423, + "step": 8142500 + }, + { + "epoch": 2.5313701200092638, + "grad_norm": 10.861579895019531, + "learning_rate": 7.810497999845605e-06, + "loss": 2.5955, + "step": 8143000 + }, + { + "epoch": 2.5315255522897506, + "grad_norm": 11.20456314086914, + "learning_rate": 7.807907461837489e-06, + "loss": 2.5578, + "step": 8143500 + }, + { + "epoch": 2.5316809845702375, + "grad_norm": 9.317791938781738, + "learning_rate": 7.805316923829376e-06, + "loss": 2.6236, + "step": 8144000 + }, + { + "epoch": 2.5318364168507244, + "grad_norm": 9.307833671569824, + "learning_rate": 7.80272638582126e-06, + "loss": 2.6419, + "step": 8144500 + }, + { + "epoch": 2.5319918491312112, + "grad_norm": 11.971848487854004, + "learning_rate": 7.800135847813147e-06, + "loss": 2.6055, + "step": 8145000 + }, + { + "epoch": 2.532147281411698, + "grad_norm": 13.710493087768555, + "learning_rate": 7.79754530980503e-06, + "loss": 2.6042, + "step": 8145500 + }, + { + "epoch": 2.532302713692185, + "grad_norm": 13.756932258605957, + "learning_rate": 7.794954771796916e-06, + "loss": 2.6276, + "step": 8146000 + }, + { + "epoch": 2.532458145972672, + "grad_norm": 11.09178638458252, + "learning_rate": 7.792364233788801e-06, + "loss": 2.5769, + "step": 8146500 + }, + { + "epoch": 2.5326135782531587, + "grad_norm": 12.341833114624023, + "learning_rate": 7.789773695780687e-06, + "loss": 2.6342, + "step": 8147000 + }, + { + "epoch": 2.5327690105336456, + "grad_norm": 8.51762580871582, + "learning_rate": 7.787183157772574e-06, + "loss": 2.5972, + "step": 8147500 + }, + { + "epoch": 2.5329244428141324, + "grad_norm": 12.818675994873047, + "learning_rate": 7.784592619764458e-06, + "loss": 2.5782, + "step": 8148000 + }, + { + "epoch": 2.5330798750946193, + "grad_norm": 12.771059036254883, + "learning_rate": 7.782002081756343e-06, + "loss": 2.6104, + "step": 8148500 + }, + { + "epoch": 2.533235307375106, + "grad_norm": 10.877591133117676, + "learning_rate": 7.779411543748229e-06, + "loss": 2.631, + "step": 8149000 + }, + { + "epoch": 2.533390739655593, + "grad_norm": 8.664549827575684, + "learning_rate": 7.776821005740114e-06, + "loss": 2.6004, + "step": 8149500 + }, + { + "epoch": 2.53354617193608, + "grad_norm": 7.867955207824707, + "learning_rate": 7.774230467732e-06, + "loss": 2.629, + "step": 8150000 + }, + { + "epoch": 2.5337016042165668, + "grad_norm": 8.77920150756836, + "learning_rate": 7.771639929723885e-06, + "loss": 2.5851, + "step": 8150500 + }, + { + "epoch": 2.5338570364970536, + "grad_norm": 8.831612586975098, + "learning_rate": 7.76904939171577e-06, + "loss": 2.5804, + "step": 8151000 + }, + { + "epoch": 2.5340124687775405, + "grad_norm": 8.696293830871582, + "learning_rate": 7.766458853707656e-06, + "loss": 2.618, + "step": 8151500 + }, + { + "epoch": 2.5341679010580274, + "grad_norm": 11.406960487365723, + "learning_rate": 7.763868315699541e-06, + "loss": 2.6164, + "step": 8152000 + }, + { + "epoch": 2.5343233333385147, + "grad_norm": 10.858932495117188, + "learning_rate": 7.761277777691427e-06, + "loss": 2.5837, + "step": 8152500 + }, + { + "epoch": 2.534478765619001, + "grad_norm": 9.049701690673828, + "learning_rate": 7.758687239683312e-06, + "loss": 2.5796, + "step": 8153000 + }, + { + "epoch": 2.5346341978994884, + "grad_norm": 10.790095329284668, + "learning_rate": 7.756096701675198e-06, + "loss": 2.6677, + "step": 8153500 + }, + { + "epoch": 2.534789630179975, + "grad_norm": 9.805624008178711, + "learning_rate": 7.753506163667083e-06, + "loss": 2.6136, + "step": 8154000 + }, + { + "epoch": 2.534945062460462, + "grad_norm": 10.856515884399414, + "learning_rate": 7.750915625658969e-06, + "loss": 2.5872, + "step": 8154500 + }, + { + "epoch": 2.5351004947409486, + "grad_norm": 8.23495101928711, + "learning_rate": 7.748325087650854e-06, + "loss": 2.5698, + "step": 8155000 + }, + { + "epoch": 2.535255927021436, + "grad_norm": 12.388507843017578, + "learning_rate": 7.745734549642738e-06, + "loss": 2.5721, + "step": 8155500 + }, + { + "epoch": 2.5354113593019223, + "grad_norm": 9.508694648742676, + "learning_rate": 7.743144011634625e-06, + "loss": 2.5926, + "step": 8156000 + }, + { + "epoch": 2.5355667915824096, + "grad_norm": 11.951297760009766, + "learning_rate": 7.74055347362651e-06, + "loss": 2.5982, + "step": 8156500 + }, + { + "epoch": 2.535722223862896, + "grad_norm": 10.808917999267578, + "learning_rate": 7.737962935618396e-06, + "loss": 2.5928, + "step": 8157000 + }, + { + "epoch": 2.5358776561433833, + "grad_norm": 9.863434791564941, + "learning_rate": 7.735372397610282e-06, + "loss": 2.6177, + "step": 8157500 + }, + { + "epoch": 2.53603308842387, + "grad_norm": 12.622912406921387, + "learning_rate": 7.732781859602165e-06, + "loss": 2.6174, + "step": 8158000 + }, + { + "epoch": 2.536188520704357, + "grad_norm": 10.183385848999023, + "learning_rate": 7.730191321594052e-06, + "loss": 2.5875, + "step": 8158500 + }, + { + "epoch": 2.536343952984844, + "grad_norm": 11.05162525177002, + "learning_rate": 7.727600783585936e-06, + "loss": 2.6442, + "step": 8159000 + }, + { + "epoch": 2.536499385265331, + "grad_norm": 9.518540382385254, + "learning_rate": 7.725010245577823e-06, + "loss": 2.6517, + "step": 8159500 + }, + { + "epoch": 2.5366548175458177, + "grad_norm": 13.661548614501953, + "learning_rate": 7.722419707569707e-06, + "loss": 2.5783, + "step": 8160000 + }, + { + "epoch": 2.5368102498263045, + "grad_norm": 9.73977279663086, + "learning_rate": 7.719829169561593e-06, + "loss": 2.603, + "step": 8160500 + }, + { + "epoch": 2.5369656821067914, + "grad_norm": 15.45311450958252, + "learning_rate": 7.71723863155348e-06, + "loss": 2.6244, + "step": 8161000 + }, + { + "epoch": 2.5371211143872783, + "grad_norm": 10.204269409179688, + "learning_rate": 7.714648093545364e-06, + "loss": 2.6955, + "step": 8161500 + }, + { + "epoch": 2.537276546667765, + "grad_norm": 8.209235191345215, + "learning_rate": 7.71205755553725e-06, + "loss": 2.6105, + "step": 8162000 + }, + { + "epoch": 2.537431978948252, + "grad_norm": 10.710893630981445, + "learning_rate": 7.709467017529134e-06, + "loss": 2.643, + "step": 8162500 + }, + { + "epoch": 2.537587411228739, + "grad_norm": 7.214621067047119, + "learning_rate": 7.70687647952102e-06, + "loss": 2.6167, + "step": 8163000 + }, + { + "epoch": 2.5377428435092257, + "grad_norm": 13.716434478759766, + "learning_rate": 7.704285941512905e-06, + "loss": 2.5939, + "step": 8163500 + }, + { + "epoch": 2.5378982757897126, + "grad_norm": 11.093544960021973, + "learning_rate": 7.70169540350479e-06, + "loss": 2.6681, + "step": 8164000 + }, + { + "epoch": 2.5380537080701995, + "grad_norm": 8.819168090820312, + "learning_rate": 7.699104865496676e-06, + "loss": 2.614, + "step": 8164500 + }, + { + "epoch": 2.5382091403506863, + "grad_norm": 16.898557662963867, + "learning_rate": 7.696514327488562e-06, + "loss": 2.5624, + "step": 8165000 + }, + { + "epoch": 2.538364572631173, + "grad_norm": 8.65599536895752, + "learning_rate": 7.693923789480447e-06, + "loss": 2.6041, + "step": 8165500 + }, + { + "epoch": 2.53852000491166, + "grad_norm": 8.912819862365723, + "learning_rate": 7.691333251472333e-06, + "loss": 2.6134, + "step": 8166000 + }, + { + "epoch": 2.538675437192147, + "grad_norm": 28.005155563354492, + "learning_rate": 7.688742713464218e-06, + "loss": 2.5564, + "step": 8166500 + }, + { + "epoch": 2.538830869472634, + "grad_norm": 10.215710639953613, + "learning_rate": 7.686152175456104e-06, + "loss": 2.6366, + "step": 8167000 + }, + { + "epoch": 2.5389863017531207, + "grad_norm": 8.892660140991211, + "learning_rate": 7.683561637447989e-06, + "loss": 2.6055, + "step": 8167500 + }, + { + "epoch": 2.5391417340336075, + "grad_norm": 17.748027801513672, + "learning_rate": 7.680971099439874e-06, + "loss": 2.6474, + "step": 8168000 + }, + { + "epoch": 2.5392971663140944, + "grad_norm": 9.41487979888916, + "learning_rate": 7.67838056143176e-06, + "loss": 2.6101, + "step": 8168500 + }, + { + "epoch": 2.5394525985945813, + "grad_norm": 5.51111364364624, + "learning_rate": 7.675790023423645e-06, + "loss": 2.6138, + "step": 8169000 + }, + { + "epoch": 2.539608030875068, + "grad_norm": 9.664793968200684, + "learning_rate": 7.673199485415531e-06, + "loss": 2.6191, + "step": 8169500 + }, + { + "epoch": 2.539763463155555, + "grad_norm": 6.9421210289001465, + "learning_rate": 7.670608947407416e-06, + "loss": 2.6121, + "step": 8170000 + }, + { + "epoch": 2.539918895436042, + "grad_norm": 6.27296781539917, + "learning_rate": 7.668018409399302e-06, + "loss": 2.5634, + "step": 8170500 + }, + { + "epoch": 2.5400743277165287, + "grad_norm": 9.266822814941406, + "learning_rate": 7.665427871391187e-06, + "loss": 2.6133, + "step": 8171000 + }, + { + "epoch": 2.5402297599970156, + "grad_norm": 10.534065246582031, + "learning_rate": 7.662837333383071e-06, + "loss": 2.6355, + "step": 8171500 + }, + { + "epoch": 2.5403851922775025, + "grad_norm": 8.656684875488281, + "learning_rate": 7.660246795374958e-06, + "loss": 2.5887, + "step": 8172000 + }, + { + "epoch": 2.5405406245579893, + "grad_norm": 8.658540725708008, + "learning_rate": 7.657656257366842e-06, + "loss": 2.5908, + "step": 8172500 + }, + { + "epoch": 2.540696056838476, + "grad_norm": 10.053603172302246, + "learning_rate": 7.655065719358729e-06, + "loss": 2.5782, + "step": 8173000 + }, + { + "epoch": 2.540851489118963, + "grad_norm": 10.555617332458496, + "learning_rate": 7.652475181350615e-06, + "loss": 2.6216, + "step": 8173500 + }, + { + "epoch": 2.54100692139945, + "grad_norm": 9.26319408416748, + "learning_rate": 7.649884643342498e-06, + "loss": 2.6039, + "step": 8174000 + }, + { + "epoch": 2.541162353679937, + "grad_norm": 10.709565162658691, + "learning_rate": 7.647294105334385e-06, + "loss": 2.6409, + "step": 8174500 + }, + { + "epoch": 2.5413177859604237, + "grad_norm": 9.619492530822754, + "learning_rate": 7.64470356732627e-06, + "loss": 2.5956, + "step": 8175000 + }, + { + "epoch": 2.5414732182409105, + "grad_norm": 17.787492752075195, + "learning_rate": 7.642113029318156e-06, + "loss": 2.6175, + "step": 8175500 + }, + { + "epoch": 2.541628650521398, + "grad_norm": 10.911993026733398, + "learning_rate": 7.63952249131004e-06, + "loss": 2.6096, + "step": 8176000 + }, + { + "epoch": 2.5417840828018843, + "grad_norm": 11.065884590148926, + "learning_rate": 7.636931953301926e-06, + "loss": 2.5806, + "step": 8176500 + }, + { + "epoch": 2.5419395150823716, + "grad_norm": 10.401777267456055, + "learning_rate": 7.634341415293811e-06, + "loss": 2.5514, + "step": 8177000 + }, + { + "epoch": 2.542094947362858, + "grad_norm": 20.328651428222656, + "learning_rate": 7.631750877285697e-06, + "loss": 2.6413, + "step": 8177500 + }, + { + "epoch": 2.5422503796433453, + "grad_norm": 9.065522193908691, + "learning_rate": 7.629160339277583e-06, + "loss": 2.5586, + "step": 8178000 + }, + { + "epoch": 2.5424058119238317, + "grad_norm": 7.733991622924805, + "learning_rate": 7.6265698012694674e-06, + "loss": 2.652, + "step": 8178500 + }, + { + "epoch": 2.542561244204319, + "grad_norm": 8.161446571350098, + "learning_rate": 7.623979263261354e-06, + "loss": 2.6295, + "step": 8179000 + }, + { + "epoch": 2.5427166764848055, + "grad_norm": 7.070693492889404, + "learning_rate": 7.621388725253238e-06, + "loss": 2.5532, + "step": 8179500 + }, + { + "epoch": 2.542872108765293, + "grad_norm": 9.894754409790039, + "learning_rate": 7.618798187245124e-06, + "loss": 2.5708, + "step": 8180000 + }, + { + "epoch": 2.543027541045779, + "grad_norm": 14.059164047241211, + "learning_rate": 7.6162076492370084e-06, + "loss": 2.5918, + "step": 8180500 + }, + { + "epoch": 2.5431829733262665, + "grad_norm": 11.018674850463867, + "learning_rate": 7.613617111228895e-06, + "loss": 2.6244, + "step": 8181000 + }, + { + "epoch": 2.543338405606753, + "grad_norm": 9.116135597229004, + "learning_rate": 7.611026573220779e-06, + "loss": 2.591, + "step": 8181500 + }, + { + "epoch": 2.5434938378872403, + "grad_norm": 10.556432723999023, + "learning_rate": 7.608436035212666e-06, + "loss": 2.5974, + "step": 8182000 + }, + { + "epoch": 2.543649270167727, + "grad_norm": 17.16054344177246, + "learning_rate": 7.605845497204551e-06, + "loss": 2.6028, + "step": 8182500 + }, + { + "epoch": 2.543804702448214, + "grad_norm": 9.361971855163574, + "learning_rate": 7.603254959196436e-06, + "loss": 2.6416, + "step": 8183000 + }, + { + "epoch": 2.543960134728701, + "grad_norm": 11.58646011352539, + "learning_rate": 7.600664421188322e-06, + "loss": 2.5868, + "step": 8183500 + }, + { + "epoch": 2.5441155670091877, + "grad_norm": 20.80316925048828, + "learning_rate": 7.598073883180207e-06, + "loss": 2.5739, + "step": 8184000 + }, + { + "epoch": 2.5442709992896746, + "grad_norm": 11.931253433227539, + "learning_rate": 7.595483345172093e-06, + "loss": 2.5968, + "step": 8184500 + }, + { + "epoch": 2.5444264315701615, + "grad_norm": 9.016319274902344, + "learning_rate": 7.5928928071639776e-06, + "loss": 2.592, + "step": 8185000 + }, + { + "epoch": 2.5445818638506483, + "grad_norm": 9.063504219055176, + "learning_rate": 7.590302269155863e-06, + "loss": 2.6039, + "step": 8185500 + }, + { + "epoch": 2.544737296131135, + "grad_norm": 17.511159896850586, + "learning_rate": 7.587711731147748e-06, + "loss": 2.5859, + "step": 8186000 + }, + { + "epoch": 2.544892728411622, + "grad_norm": 12.40265941619873, + "learning_rate": 7.585121193139634e-06, + "loss": 2.5865, + "step": 8186500 + }, + { + "epoch": 2.545048160692109, + "grad_norm": 9.6399564743042, + "learning_rate": 7.58253065513152e-06, + "loss": 2.6432, + "step": 8187000 + }, + { + "epoch": 2.545203592972596, + "grad_norm": 8.115999221801758, + "learning_rate": 7.579940117123405e-06, + "loss": 2.5723, + "step": 8187500 + }, + { + "epoch": 2.5453590252530827, + "grad_norm": 21.325225830078125, + "learning_rate": 7.57734957911529e-06, + "loss": 2.5708, + "step": 8188000 + }, + { + "epoch": 2.5455144575335695, + "grad_norm": 12.077388763427734, + "learning_rate": 7.574759041107175e-06, + "loss": 2.6025, + "step": 8188500 + }, + { + "epoch": 2.5456698898140564, + "grad_norm": 9.956525802612305, + "learning_rate": 7.572168503099061e-06, + "loss": 2.5914, + "step": 8189000 + }, + { + "epoch": 2.5458253220945433, + "grad_norm": 9.820789337158203, + "learning_rate": 7.569577965090946e-06, + "loss": 2.632, + "step": 8189500 + }, + { + "epoch": 2.54598075437503, + "grad_norm": 16.340917587280273, + "learning_rate": 7.566987427082832e-06, + "loss": 2.5337, + "step": 8190000 + }, + { + "epoch": 2.546136186655517, + "grad_norm": 25.90210723876953, + "learning_rate": 7.564396889074717e-06, + "loss": 2.6073, + "step": 8190500 + }, + { + "epoch": 2.546291618936004, + "grad_norm": 10.277787208557129, + "learning_rate": 7.561806351066602e-06, + "loss": 2.5864, + "step": 8191000 + }, + { + "epoch": 2.5464470512164907, + "grad_norm": 13.040691375732422, + "learning_rate": 7.5592158130584885e-06, + "loss": 2.614, + "step": 8191500 + }, + { + "epoch": 2.5466024834969776, + "grad_norm": 11.567152976989746, + "learning_rate": 7.556625275050373e-06, + "loss": 2.6084, + "step": 8192000 + }, + { + "epoch": 2.5467579157774645, + "grad_norm": 11.216090202331543, + "learning_rate": 7.5540347370422595e-06, + "loss": 2.626, + "step": 8192500 + }, + { + "epoch": 2.5469133480579513, + "grad_norm": 8.91242504119873, + "learning_rate": 7.551444199034144e-06, + "loss": 2.6403, + "step": 8193000 + }, + { + "epoch": 2.547068780338438, + "grad_norm": 10.493192672729492, + "learning_rate": 7.5488536610260295e-06, + "loss": 2.6203, + "step": 8193500 + }, + { + "epoch": 2.547224212618925, + "grad_norm": 9.916007995605469, + "learning_rate": 7.546263123017914e-06, + "loss": 2.6077, + "step": 8194000 + }, + { + "epoch": 2.547379644899412, + "grad_norm": 13.2861328125, + "learning_rate": 7.5436725850098004e-06, + "loss": 2.6453, + "step": 8194500 + }, + { + "epoch": 2.547535077179899, + "grad_norm": 20.812789916992188, + "learning_rate": 7.541082047001685e-06, + "loss": 2.6537, + "step": 8195000 + }, + { + "epoch": 2.5476905094603857, + "grad_norm": 10.181835174560547, + "learning_rate": 7.538491508993571e-06, + "loss": 2.5975, + "step": 8195500 + }, + { + "epoch": 2.5478459417408725, + "grad_norm": 27.185104370117188, + "learning_rate": 7.535900970985457e-06, + "loss": 2.5572, + "step": 8196000 + }, + { + "epoch": 2.5480013740213594, + "grad_norm": 10.195314407348633, + "learning_rate": 7.5333104329773414e-06, + "loss": 2.5864, + "step": 8196500 + }, + { + "epoch": 2.5481568063018463, + "grad_norm": 12.475146293640137, + "learning_rate": 7.530719894969228e-06, + "loss": 2.5735, + "step": 8197000 + }, + { + "epoch": 2.548312238582333, + "grad_norm": 9.81848430633545, + "learning_rate": 7.528129356961112e-06, + "loss": 2.6166, + "step": 8197500 + }, + { + "epoch": 2.54846767086282, + "grad_norm": 11.745502471923828, + "learning_rate": 7.525538818952999e-06, + "loss": 2.6351, + "step": 8198000 + }, + { + "epoch": 2.548623103143307, + "grad_norm": 8.373820304870605, + "learning_rate": 7.522948280944883e-06, + "loss": 2.6538, + "step": 8198500 + }, + { + "epoch": 2.5487785354237937, + "grad_norm": 11.091361045837402, + "learning_rate": 7.520357742936769e-06, + "loss": 2.6547, + "step": 8199000 + }, + { + "epoch": 2.5489339677042806, + "grad_norm": 16.90900993347168, + "learning_rate": 7.517767204928654e-06, + "loss": 2.609, + "step": 8199500 + }, + { + "epoch": 2.5490893999847675, + "grad_norm": 13.017641067504883, + "learning_rate": 7.51517666692054e-06, + "loss": 2.5471, + "step": 8200000 + }, + { + "epoch": 2.549244832265255, + "grad_norm": 11.945032119750977, + "learning_rate": 7.512586128912426e-06, + "loss": 2.6295, + "step": 8200500 + }, + { + "epoch": 2.549400264545741, + "grad_norm": 9.376221656799316, + "learning_rate": 7.5099955909043106e-06, + "loss": 2.6331, + "step": 8201000 + }, + { + "epoch": 2.5495556968262285, + "grad_norm": 12.836545944213867, + "learning_rate": 7.507405052896196e-06, + "loss": 2.5784, + "step": 8201500 + }, + { + "epoch": 2.549711129106715, + "grad_norm": 15.42180347442627, + "learning_rate": 7.5048145148880815e-06, + "loss": 2.6272, + "step": 8202000 + }, + { + "epoch": 2.5498665613872022, + "grad_norm": 9.694979667663574, + "learning_rate": 7.502223976879967e-06, + "loss": 2.5691, + "step": 8202500 + }, + { + "epoch": 2.5500219936676887, + "grad_norm": 11.742241859436035, + "learning_rate": 7.4996334388718516e-06, + "loss": 2.6058, + "step": 8203000 + }, + { + "epoch": 2.550177425948176, + "grad_norm": 9.403274536132812, + "learning_rate": 7.497042900863738e-06, + "loss": 2.5803, + "step": 8203500 + }, + { + "epoch": 2.5503328582286624, + "grad_norm": 11.537360191345215, + "learning_rate": 7.4944523628556225e-06, + "loss": 2.5698, + "step": 8204000 + }, + { + "epoch": 2.5504882905091497, + "grad_norm": 9.702737808227539, + "learning_rate": 7.491861824847509e-06, + "loss": 2.6157, + "step": 8204500 + }, + { + "epoch": 2.550643722789636, + "grad_norm": 8.615379333496094, + "learning_rate": 7.489271286839394e-06, + "loss": 2.6242, + "step": 8205000 + }, + { + "epoch": 2.5507991550701234, + "grad_norm": 10.2607421875, + "learning_rate": 7.486680748831279e-06, + "loss": 2.6379, + "step": 8205500 + }, + { + "epoch": 2.5509545873506103, + "grad_norm": 10.268379211425781, + "learning_rate": 7.484090210823165e-06, + "loss": 2.6186, + "step": 8206000 + }, + { + "epoch": 2.551110019631097, + "grad_norm": 9.331948280334473, + "learning_rate": 7.48149967281505e-06, + "loss": 2.6262, + "step": 8206500 + }, + { + "epoch": 2.551265451911584, + "grad_norm": 14.595244407653809, + "learning_rate": 7.478909134806936e-06, + "loss": 2.6133, + "step": 8207000 + }, + { + "epoch": 2.551420884192071, + "grad_norm": 13.661111831665039, + "learning_rate": 7.476318596798821e-06, + "loss": 2.6094, + "step": 8207500 + }, + { + "epoch": 2.551576316472558, + "grad_norm": 10.792611122131348, + "learning_rate": 7.473728058790706e-06, + "loss": 2.6018, + "step": 8208000 + }, + { + "epoch": 2.5517317487530446, + "grad_norm": 9.535724639892578, + "learning_rate": 7.471137520782591e-06, + "loss": 2.6072, + "step": 8208500 + }, + { + "epoch": 2.5518871810335315, + "grad_norm": 15.077856063842773, + "learning_rate": 7.468546982774477e-06, + "loss": 2.6114, + "step": 8209000 + }, + { + "epoch": 2.5520426133140184, + "grad_norm": 16.017337799072266, + "learning_rate": 7.465956444766363e-06, + "loss": 2.6135, + "step": 8209500 + }, + { + "epoch": 2.5521980455945052, + "grad_norm": 26.705921173095703, + "learning_rate": 7.463365906758248e-06, + "loss": 2.6215, + "step": 8210000 + }, + { + "epoch": 2.552353477874992, + "grad_norm": 11.004520416259766, + "learning_rate": 7.4607753687501334e-06, + "loss": 2.5876, + "step": 8210500 + }, + { + "epoch": 2.552508910155479, + "grad_norm": 11.161596298217773, + "learning_rate": 7.458184830742018e-06, + "loss": 2.5754, + "step": 8211000 + }, + { + "epoch": 2.552664342435966, + "grad_norm": 9.113155364990234, + "learning_rate": 7.455594292733904e-06, + "loss": 2.6147, + "step": 8211500 + }, + { + "epoch": 2.5528197747164527, + "grad_norm": 9.520030975341797, + "learning_rate": 7.453003754725789e-06, + "loss": 2.5763, + "step": 8212000 + }, + { + "epoch": 2.5529752069969396, + "grad_norm": 28.91440773010254, + "learning_rate": 7.450413216717675e-06, + "loss": 2.6042, + "step": 8212500 + }, + { + "epoch": 2.5531306392774265, + "grad_norm": 11.185842514038086, + "learning_rate": 7.44782267870956e-06, + "loss": 2.5494, + "step": 8213000 + }, + { + "epoch": 2.5532860715579133, + "grad_norm": 9.953495979309082, + "learning_rate": 7.445232140701445e-06, + "loss": 2.5892, + "step": 8213500 + }, + { + "epoch": 2.5534415038384, + "grad_norm": 9.293089866638184, + "learning_rate": 7.442641602693332e-06, + "loss": 2.6025, + "step": 8214000 + }, + { + "epoch": 2.553596936118887, + "grad_norm": 26.519868850708008, + "learning_rate": 7.440051064685216e-06, + "loss": 2.6459, + "step": 8214500 + }, + { + "epoch": 2.553752368399374, + "grad_norm": 9.540491104125977, + "learning_rate": 7.437460526677103e-06, + "loss": 2.6408, + "step": 8215000 + }, + { + "epoch": 2.553907800679861, + "grad_norm": 14.893844604492188, + "learning_rate": 7.434869988668987e-06, + "loss": 2.593, + "step": 8215500 + }, + { + "epoch": 2.5540632329603477, + "grad_norm": 8.168548583984375, + "learning_rate": 7.432279450660873e-06, + "loss": 2.572, + "step": 8216000 + }, + { + "epoch": 2.5542186652408345, + "grad_norm": 7.105481147766113, + "learning_rate": 7.429688912652757e-06, + "loss": 2.5736, + "step": 8216500 + }, + { + "epoch": 2.5543740975213214, + "grad_norm": 9.323372840881348, + "learning_rate": 7.4270983746446436e-06, + "loss": 2.5861, + "step": 8217000 + }, + { + "epoch": 2.5545295298018083, + "grad_norm": 9.226448059082031, + "learning_rate": 7.424507836636528e-06, + "loss": 2.5982, + "step": 8217500 + }, + { + "epoch": 2.554684962082295, + "grad_norm": 7.827169895172119, + "learning_rate": 7.4219172986284145e-06, + "loss": 2.605, + "step": 8218000 + }, + { + "epoch": 2.554840394362782, + "grad_norm": 28.561269760131836, + "learning_rate": 7.4193267606203e-06, + "loss": 2.6385, + "step": 8218500 + }, + { + "epoch": 2.554995826643269, + "grad_norm": 9.52683162689209, + "learning_rate": 7.4167362226121846e-06, + "loss": 2.5713, + "step": 8219000 + }, + { + "epoch": 2.5551512589237557, + "grad_norm": 12.616253852844238, + "learning_rate": 7.414145684604071e-06, + "loss": 2.623, + "step": 8219500 + }, + { + "epoch": 2.5553066912042426, + "grad_norm": 8.004280090332031, + "learning_rate": 7.4115551465959555e-06, + "loss": 2.5934, + "step": 8220000 + }, + { + "epoch": 2.5554621234847295, + "grad_norm": 8.660505294799805, + "learning_rate": 7.408964608587842e-06, + "loss": 2.6128, + "step": 8220500 + }, + { + "epoch": 2.5556175557652163, + "grad_norm": 25.212112426757812, + "learning_rate": 7.406374070579726e-06, + "loss": 2.6319, + "step": 8221000 + }, + { + "epoch": 2.555772988045703, + "grad_norm": 12.421100616455078, + "learning_rate": 7.403783532571612e-06, + "loss": 2.566, + "step": 8221500 + }, + { + "epoch": 2.55592842032619, + "grad_norm": 17.66309928894043, + "learning_rate": 7.4011929945634965e-06, + "loss": 2.5958, + "step": 8222000 + }, + { + "epoch": 2.556083852606677, + "grad_norm": 10.159113883972168, + "learning_rate": 7.398602456555383e-06, + "loss": 2.6363, + "step": 8222500 + }, + { + "epoch": 2.556239284887164, + "grad_norm": 9.41918659210205, + "learning_rate": 7.396011918547269e-06, + "loss": 2.5748, + "step": 8223000 + }, + { + "epoch": 2.5563947171676507, + "grad_norm": 45.09393310546875, + "learning_rate": 7.393421380539154e-06, + "loss": 2.6383, + "step": 8223500 + }, + { + "epoch": 2.556550149448138, + "grad_norm": 16.20643424987793, + "learning_rate": 7.390830842531039e-06, + "loss": 2.6278, + "step": 8224000 + }, + { + "epoch": 2.5567055817286244, + "grad_norm": 11.152857780456543, + "learning_rate": 7.388240304522924e-06, + "loss": 2.6251, + "step": 8224500 + }, + { + "epoch": 2.5568610140091117, + "grad_norm": 13.713282585144043, + "learning_rate": 7.38564976651481e-06, + "loss": 2.6189, + "step": 8225000 + }, + { + "epoch": 2.557016446289598, + "grad_norm": 11.92130184173584, + "learning_rate": 7.383059228506695e-06, + "loss": 2.5875, + "step": 8225500 + }, + { + "epoch": 2.5571718785700854, + "grad_norm": 10.732959747314453, + "learning_rate": 7.380468690498581e-06, + "loss": 2.5904, + "step": 8226000 + }, + { + "epoch": 2.557327310850572, + "grad_norm": 9.493596076965332, + "learning_rate": 7.377878152490466e-06, + "loss": 2.6275, + "step": 8226500 + }, + { + "epoch": 2.557482743131059, + "grad_norm": 10.211524963378906, + "learning_rate": 7.375287614482351e-06, + "loss": 2.6097, + "step": 8227000 + }, + { + "epoch": 2.5576381754115456, + "grad_norm": 8.215271949768066, + "learning_rate": 7.372697076474237e-06, + "loss": 2.6004, + "step": 8227500 + }, + { + "epoch": 2.557793607692033, + "grad_norm": 11.19320011138916, + "learning_rate": 7.370106538466122e-06, + "loss": 2.5736, + "step": 8228000 + }, + { + "epoch": 2.5579490399725193, + "grad_norm": 12.69838809967041, + "learning_rate": 7.367516000458008e-06, + "loss": 2.6284, + "step": 8228500 + }, + { + "epoch": 2.5581044722530066, + "grad_norm": 28.599685668945312, + "learning_rate": 7.364925462449893e-06, + "loss": 2.617, + "step": 8229000 + }, + { + "epoch": 2.558259904533493, + "grad_norm": 8.730783462524414, + "learning_rate": 7.362334924441778e-06, + "loss": 2.6147, + "step": 8229500 + }, + { + "epoch": 2.5584153368139804, + "grad_norm": 9.865683555603027, + "learning_rate": 7.359744386433663e-06, + "loss": 2.5597, + "step": 8230000 + }, + { + "epoch": 2.5585707690944672, + "grad_norm": 12.004069328308105, + "learning_rate": 7.357153848425549e-06, + "loss": 2.6802, + "step": 8230500 + }, + { + "epoch": 2.558726201374954, + "grad_norm": 9.593363761901855, + "learning_rate": 7.354563310417434e-06, + "loss": 2.6032, + "step": 8231000 + }, + { + "epoch": 2.558881633655441, + "grad_norm": 10.522798538208008, + "learning_rate": 7.35197277240932e-06, + "loss": 2.6413, + "step": 8231500 + }, + { + "epoch": 2.559037065935928, + "grad_norm": 9.303056716918945, + "learning_rate": 7.349382234401206e-06, + "loss": 2.5714, + "step": 8232000 + }, + { + "epoch": 2.5591924982164147, + "grad_norm": 9.533048629760742, + "learning_rate": 7.34679169639309e-06, + "loss": 2.6053, + "step": 8232500 + }, + { + "epoch": 2.5593479304969016, + "grad_norm": 9.392561912536621, + "learning_rate": 7.3442011583849766e-06, + "loss": 2.5828, + "step": 8233000 + }, + { + "epoch": 2.5595033627773884, + "grad_norm": 9.21910572052002, + "learning_rate": 7.341610620376861e-06, + "loss": 2.5363, + "step": 8233500 + }, + { + "epoch": 2.5596587950578753, + "grad_norm": 8.827605247497559, + "learning_rate": 7.3390200823687475e-06, + "loss": 2.6293, + "step": 8234000 + }, + { + "epoch": 2.559814227338362, + "grad_norm": 11.77977466583252, + "learning_rate": 7.336429544360632e-06, + "loss": 2.5844, + "step": 8234500 + }, + { + "epoch": 2.559969659618849, + "grad_norm": 11.064496994018555, + "learning_rate": 7.3338390063525176e-06, + "loss": 2.5921, + "step": 8235000 + }, + { + "epoch": 2.560125091899336, + "grad_norm": 22.563989639282227, + "learning_rate": 7.331248468344402e-06, + "loss": 2.6004, + "step": 8235500 + }, + { + "epoch": 2.5602805241798228, + "grad_norm": 26.74777603149414, + "learning_rate": 7.3286579303362885e-06, + "loss": 2.6131, + "step": 8236000 + }, + { + "epoch": 2.5604359564603096, + "grad_norm": 10.69764232635498, + "learning_rate": 7.326067392328175e-06, + "loss": 2.5674, + "step": 8236500 + }, + { + "epoch": 2.5605913887407965, + "grad_norm": 10.37037467956543, + "learning_rate": 7.323476854320059e-06, + "loss": 2.6113, + "step": 8237000 + }, + { + "epoch": 2.5607468210212834, + "grad_norm": 8.672229766845703, + "learning_rate": 7.320886316311945e-06, + "loss": 2.5751, + "step": 8237500 + }, + { + "epoch": 2.5609022533017702, + "grad_norm": 11.701895713806152, + "learning_rate": 7.3182957783038295e-06, + "loss": 2.6026, + "step": 8238000 + }, + { + "epoch": 2.561057685582257, + "grad_norm": 10.142619132995605, + "learning_rate": 7.315705240295716e-06, + "loss": 2.6179, + "step": 8238500 + }, + { + "epoch": 2.561213117862744, + "grad_norm": 9.944328308105469, + "learning_rate": 7.3131147022876e-06, + "loss": 2.5683, + "step": 8239000 + }, + { + "epoch": 2.561368550143231, + "grad_norm": 7.731613636016846, + "learning_rate": 7.310524164279487e-06, + "loss": 2.614, + "step": 8239500 + }, + { + "epoch": 2.5615239824237177, + "grad_norm": 11.289143562316895, + "learning_rate": 7.307933626271371e-06, + "loss": 2.6158, + "step": 8240000 + }, + { + "epoch": 2.5616794147042046, + "grad_norm": 9.88925838470459, + "learning_rate": 7.305343088263257e-06, + "loss": 2.623, + "step": 8240500 + }, + { + "epoch": 2.5618348469846914, + "grad_norm": 10.241586685180664, + "learning_rate": 7.302752550255143e-06, + "loss": 2.5987, + "step": 8241000 + }, + { + "epoch": 2.5619902792651783, + "grad_norm": 11.323216438293457, + "learning_rate": 7.300162012247028e-06, + "loss": 2.5961, + "step": 8241500 + }, + { + "epoch": 2.562145711545665, + "grad_norm": 9.5556001663208, + "learning_rate": 7.297571474238914e-06, + "loss": 2.5725, + "step": 8242000 + }, + { + "epoch": 2.562301143826152, + "grad_norm": 10.607611656188965, + "learning_rate": 7.294980936230799e-06, + "loss": 2.6079, + "step": 8242500 + }, + { + "epoch": 2.562456576106639, + "grad_norm": 11.46381664276123, + "learning_rate": 7.292390398222684e-06, + "loss": 2.5831, + "step": 8243000 + }, + { + "epoch": 2.5626120083871258, + "grad_norm": 9.67904281616211, + "learning_rate": 7.289799860214569e-06, + "loss": 2.6167, + "step": 8243500 + }, + { + "epoch": 2.5627674406676126, + "grad_norm": 8.916491508483887, + "learning_rate": 7.287209322206455e-06, + "loss": 2.6322, + "step": 8244000 + }, + { + "epoch": 2.5629228729480995, + "grad_norm": 8.987576484680176, + "learning_rate": 7.28461878419834e-06, + "loss": 2.6009, + "step": 8244500 + }, + { + "epoch": 2.5630783052285864, + "grad_norm": 13.130693435668945, + "learning_rate": 7.282028246190226e-06, + "loss": 2.6244, + "step": 8245000 + }, + { + "epoch": 2.5632337375090732, + "grad_norm": 8.568435668945312, + "learning_rate": 7.279437708182111e-06, + "loss": 2.6045, + "step": 8245500 + }, + { + "epoch": 2.56338916978956, + "grad_norm": 9.762197494506836, + "learning_rate": 7.276847170173996e-06, + "loss": 2.6072, + "step": 8246000 + }, + { + "epoch": 2.563544602070047, + "grad_norm": 7.701026439666748, + "learning_rate": 7.274256632165882e-06, + "loss": 2.5931, + "step": 8246500 + }, + { + "epoch": 2.563700034350534, + "grad_norm": 10.059914588928223, + "learning_rate": 7.271666094157767e-06, + "loss": 2.5961, + "step": 8247000 + }, + { + "epoch": 2.5638554666310207, + "grad_norm": 6.908936023712158, + "learning_rate": 7.269075556149653e-06, + "loss": 2.5976, + "step": 8247500 + }, + { + "epoch": 2.5640108989115076, + "grad_norm": 8.07271671295166, + "learning_rate": 7.266485018141538e-06, + "loss": 2.6046, + "step": 8248000 + }, + { + "epoch": 2.564166331191995, + "grad_norm": 23.2513370513916, + "learning_rate": 7.263894480133423e-06, + "loss": 2.5804, + "step": 8248500 + }, + { + "epoch": 2.5643217634724813, + "grad_norm": 6.279652118682861, + "learning_rate": 7.261303942125308e-06, + "loss": 2.6143, + "step": 8249000 + }, + { + "epoch": 2.5644771957529686, + "grad_norm": 9.137741088867188, + "learning_rate": 7.258713404117194e-06, + "loss": 2.6208, + "step": 8249500 + }, + { + "epoch": 2.564632628033455, + "grad_norm": 10.535811424255371, + "learning_rate": 7.2561228661090805e-06, + "loss": 2.5796, + "step": 8250000 + }, + { + "epoch": 2.5647880603139424, + "grad_norm": 9.477161407470703, + "learning_rate": 7.253532328100965e-06, + "loss": 2.6415, + "step": 8250500 + }, + { + "epoch": 2.5649434925944288, + "grad_norm": 10.868423461914062, + "learning_rate": 7.2509417900928506e-06, + "loss": 2.6139, + "step": 8251000 + }, + { + "epoch": 2.565098924874916, + "grad_norm": 40.004234313964844, + "learning_rate": 7.248351252084735e-06, + "loss": 2.6035, + "step": 8251500 + }, + { + "epoch": 2.5652543571554025, + "grad_norm": 11.075998306274414, + "learning_rate": 7.2457607140766215e-06, + "loss": 2.6377, + "step": 8252000 + }, + { + "epoch": 2.56540978943589, + "grad_norm": 10.727316856384277, + "learning_rate": 7.243170176068506e-06, + "loss": 2.674, + "step": 8252500 + }, + { + "epoch": 2.5655652217163762, + "grad_norm": 10.773099899291992, + "learning_rate": 7.240579638060392e-06, + "loss": 2.5944, + "step": 8253000 + }, + { + "epoch": 2.5657206539968636, + "grad_norm": 10.204127311706543, + "learning_rate": 7.237989100052277e-06, + "loss": 2.5857, + "step": 8253500 + }, + { + "epoch": 2.56587608627735, + "grad_norm": 10.842013359069824, + "learning_rate": 7.2353985620441625e-06, + "loss": 2.6103, + "step": 8254000 + }, + { + "epoch": 2.5660315185578373, + "grad_norm": 27.467378616333008, + "learning_rate": 7.232808024036049e-06, + "loss": 2.6189, + "step": 8254500 + }, + { + "epoch": 2.566186950838324, + "grad_norm": 9.529592514038086, + "learning_rate": 7.230217486027933e-06, + "loss": 2.52, + "step": 8255000 + }, + { + "epoch": 2.566342383118811, + "grad_norm": 16.481233596801758, + "learning_rate": 7.22762694801982e-06, + "loss": 2.6371, + "step": 8255500 + }, + { + "epoch": 2.566497815399298, + "grad_norm": 10.536101341247559, + "learning_rate": 7.225036410011704e-06, + "loss": 2.5237, + "step": 8256000 + }, + { + "epoch": 2.5666532476797848, + "grad_norm": 8.645468711853027, + "learning_rate": 7.22244587200359e-06, + "loss": 2.5983, + "step": 8256500 + }, + { + "epoch": 2.5668086799602716, + "grad_norm": 8.91834831237793, + "learning_rate": 7.219855333995474e-06, + "loss": 2.5604, + "step": 8257000 + }, + { + "epoch": 2.5669641122407585, + "grad_norm": 16.08692741394043, + "learning_rate": 7.217264795987361e-06, + "loss": 2.6116, + "step": 8257500 + }, + { + "epoch": 2.5671195445212454, + "grad_norm": 14.397798538208008, + "learning_rate": 7.214674257979245e-06, + "loss": 2.5419, + "step": 8258000 + }, + { + "epoch": 2.5672749768017322, + "grad_norm": 17.260835647583008, + "learning_rate": 7.212083719971132e-06, + "loss": 2.601, + "step": 8258500 + }, + { + "epoch": 2.567430409082219, + "grad_norm": 13.129103660583496, + "learning_rate": 7.209493181963017e-06, + "loss": 2.5901, + "step": 8259000 + }, + { + "epoch": 2.567585841362706, + "grad_norm": 9.864009857177734, + "learning_rate": 7.206902643954902e-06, + "loss": 2.5598, + "step": 8259500 + }, + { + "epoch": 2.567741273643193, + "grad_norm": 7.957677364349365, + "learning_rate": 7.204312105946788e-06, + "loss": 2.6032, + "step": 8260000 + }, + { + "epoch": 2.5678967059236797, + "grad_norm": 10.055936813354492, + "learning_rate": 7.201721567938673e-06, + "loss": 2.595, + "step": 8260500 + }, + { + "epoch": 2.5680521382041666, + "grad_norm": 45.60689926147461, + "learning_rate": 7.199131029930559e-06, + "loss": 2.6193, + "step": 8261000 + }, + { + "epoch": 2.5682075704846534, + "grad_norm": 9.929609298706055, + "learning_rate": 7.1965404919224435e-06, + "loss": 2.6168, + "step": 8261500 + }, + { + "epoch": 2.5683630027651403, + "grad_norm": 10.758578300476074, + "learning_rate": 7.193949953914329e-06, + "loss": 2.6615, + "step": 8262000 + }, + { + "epoch": 2.568518435045627, + "grad_norm": 10.228370666503906, + "learning_rate": 7.191359415906214e-06, + "loss": 2.5992, + "step": 8262500 + }, + { + "epoch": 2.568673867326114, + "grad_norm": 10.637198448181152, + "learning_rate": 7.1887688778981e-06, + "loss": 2.5853, + "step": 8263000 + }, + { + "epoch": 2.568829299606601, + "grad_norm": 8.935498237609863, + "learning_rate": 7.186178339889986e-06, + "loss": 2.6142, + "step": 8263500 + }, + { + "epoch": 2.5689847318870878, + "grad_norm": 8.188820838928223, + "learning_rate": 7.183587801881871e-06, + "loss": 2.6326, + "step": 8264000 + }, + { + "epoch": 2.5691401641675746, + "grad_norm": 10.851040840148926, + "learning_rate": 7.180997263873756e-06, + "loss": 2.6574, + "step": 8264500 + }, + { + "epoch": 2.5692955964480615, + "grad_norm": 11.814241409301758, + "learning_rate": 7.178406725865641e-06, + "loss": 2.59, + "step": 8265000 + }, + { + "epoch": 2.5694510287285484, + "grad_norm": 13.007099151611328, + "learning_rate": 7.175816187857527e-06, + "loss": 2.6253, + "step": 8265500 + }, + { + "epoch": 2.5696064610090352, + "grad_norm": 10.686391830444336, + "learning_rate": 7.173225649849412e-06, + "loss": 2.5651, + "step": 8266000 + }, + { + "epoch": 2.569761893289522, + "grad_norm": 6.532851696014404, + "learning_rate": 7.170635111841298e-06, + "loss": 2.5709, + "step": 8266500 + }, + { + "epoch": 2.569917325570009, + "grad_norm": 10.104896545410156, + "learning_rate": 7.168044573833183e-06, + "loss": 2.5913, + "step": 8267000 + }, + { + "epoch": 2.570072757850496, + "grad_norm": 9.259967803955078, + "learning_rate": 7.165454035825068e-06, + "loss": 2.5731, + "step": 8267500 + }, + { + "epoch": 2.5702281901309827, + "grad_norm": 9.36861801147461, + "learning_rate": 7.1628634978169545e-06, + "loss": 2.5769, + "step": 8268000 + }, + { + "epoch": 2.5703836224114696, + "grad_norm": 10.044987678527832, + "learning_rate": 7.160272959808839e-06, + "loss": 2.6078, + "step": 8268500 + }, + { + "epoch": 2.5705390546919564, + "grad_norm": 10.584217071533203, + "learning_rate": 7.157682421800725e-06, + "loss": 2.6608, + "step": 8269000 + }, + { + "epoch": 2.5706944869724433, + "grad_norm": 10.051590919494629, + "learning_rate": 7.15509188379261e-06, + "loss": 2.5874, + "step": 8269500 + }, + { + "epoch": 2.57084991925293, + "grad_norm": 11.788219451904297, + "learning_rate": 7.1525013457844955e-06, + "loss": 2.6149, + "step": 8270000 + }, + { + "epoch": 2.571005351533417, + "grad_norm": 38.127235412597656, + "learning_rate": 7.14991080777638e-06, + "loss": 2.6278, + "step": 8270500 + }, + { + "epoch": 2.571160783813904, + "grad_norm": 8.763999938964844, + "learning_rate": 7.147320269768266e-06, + "loss": 2.5887, + "step": 8271000 + }, + { + "epoch": 2.5713162160943908, + "grad_norm": 10.216907501220703, + "learning_rate": 7.144729731760151e-06, + "loss": 2.6236, + "step": 8271500 + }, + { + "epoch": 2.5714716483748776, + "grad_norm": 10.31080150604248, + "learning_rate": 7.142139193752037e-06, + "loss": 2.5859, + "step": 8272000 + }, + { + "epoch": 2.5716270806553645, + "grad_norm": 8.359864234924316, + "learning_rate": 7.139548655743923e-06, + "loss": 2.608, + "step": 8272500 + }, + { + "epoch": 2.571782512935852, + "grad_norm": 10.47052001953125, + "learning_rate": 7.136958117735807e-06, + "loss": 2.609, + "step": 8273000 + }, + { + "epoch": 2.5719379452163382, + "grad_norm": 9.434427261352539, + "learning_rate": 7.134367579727694e-06, + "loss": 2.5937, + "step": 8273500 + }, + { + "epoch": 2.5720933774968255, + "grad_norm": 27.714401245117188, + "learning_rate": 7.131777041719578e-06, + "loss": 2.6003, + "step": 8274000 + }, + { + "epoch": 2.572248809777312, + "grad_norm": 13.365077018737793, + "learning_rate": 7.129186503711465e-06, + "loss": 2.5965, + "step": 8274500 + }, + { + "epoch": 2.5724042420577993, + "grad_norm": 11.803712844848633, + "learning_rate": 7.126595965703349e-06, + "loss": 2.5823, + "step": 8275000 + }, + { + "epoch": 2.5725596743382857, + "grad_norm": 11.36325740814209, + "learning_rate": 7.124005427695235e-06, + "loss": 2.5947, + "step": 8275500 + }, + { + "epoch": 2.572715106618773, + "grad_norm": 16.525985717773438, + "learning_rate": 7.121414889687119e-06, + "loss": 2.6424, + "step": 8276000 + }, + { + "epoch": 2.5728705388992594, + "grad_norm": 9.649624824523926, + "learning_rate": 7.118824351679006e-06, + "loss": 2.6187, + "step": 8276500 + }, + { + "epoch": 2.5730259711797467, + "grad_norm": 24.4836483001709, + "learning_rate": 7.116233813670892e-06, + "loss": 2.5769, + "step": 8277000 + }, + { + "epoch": 2.573181403460233, + "grad_norm": 78.93767547607422, + "learning_rate": 7.1136432756627765e-06, + "loss": 2.5968, + "step": 8277500 + }, + { + "epoch": 2.5733368357407205, + "grad_norm": 10.394143104553223, + "learning_rate": 7.111052737654662e-06, + "loss": 2.5776, + "step": 8278000 + }, + { + "epoch": 2.5734922680212073, + "grad_norm": 10.054924964904785, + "learning_rate": 7.108462199646547e-06, + "loss": 2.5525, + "step": 8278500 + }, + { + "epoch": 2.573647700301694, + "grad_norm": 9.704644203186035, + "learning_rate": 7.105871661638433e-06, + "loss": 2.6554, + "step": 8279000 + }, + { + "epoch": 2.573803132582181, + "grad_norm": 11.521234512329102, + "learning_rate": 7.1032811236303175e-06, + "loss": 2.5667, + "step": 8279500 + }, + { + "epoch": 2.573958564862668, + "grad_norm": 10.557570457458496, + "learning_rate": 7.100690585622204e-06, + "loss": 2.5633, + "step": 8280000 + }, + { + "epoch": 2.574113997143155, + "grad_norm": 10.577905654907227, + "learning_rate": 7.0981000476140884e-06, + "loss": 2.5704, + "step": 8280500 + }, + { + "epoch": 2.5742694294236417, + "grad_norm": 13.112614631652832, + "learning_rate": 7.095509509605974e-06, + "loss": 2.6112, + "step": 8281000 + }, + { + "epoch": 2.5744248617041285, + "grad_norm": 12.276103973388672, + "learning_rate": 7.09291897159786e-06, + "loss": 2.6133, + "step": 8281500 + }, + { + "epoch": 2.5745802939846154, + "grad_norm": 13.859746932983398, + "learning_rate": 7.090328433589745e-06, + "loss": 2.5998, + "step": 8282000 + }, + { + "epoch": 2.5747357262651023, + "grad_norm": 10.275481224060059, + "learning_rate": 7.087737895581631e-06, + "loss": 2.6134, + "step": 8282500 + }, + { + "epoch": 2.574891158545589, + "grad_norm": 9.68271255493164, + "learning_rate": 7.085147357573516e-06, + "loss": 2.6132, + "step": 8283000 + }, + { + "epoch": 2.575046590826076, + "grad_norm": 10.29710865020752, + "learning_rate": 7.082556819565401e-06, + "loss": 2.6442, + "step": 8283500 + }, + { + "epoch": 2.575202023106563, + "grad_norm": 12.126941680908203, + "learning_rate": 7.079966281557286e-06, + "loss": 2.5925, + "step": 8284000 + }, + { + "epoch": 2.5753574553870497, + "grad_norm": 13.44020938873291, + "learning_rate": 7.077375743549172e-06, + "loss": 2.6129, + "step": 8284500 + }, + { + "epoch": 2.5755128876675366, + "grad_norm": 8.431970596313477, + "learning_rate": 7.074785205541057e-06, + "loss": 2.5939, + "step": 8285000 + }, + { + "epoch": 2.5756683199480235, + "grad_norm": 20.665904998779297, + "learning_rate": 7.072194667532943e-06, + "loss": 2.6141, + "step": 8285500 + }, + { + "epoch": 2.5758237522285103, + "grad_norm": 7.704147815704346, + "learning_rate": 7.0696041295248285e-06, + "loss": 2.5912, + "step": 8286000 + }, + { + "epoch": 2.575979184508997, + "grad_norm": 9.558913230895996, + "learning_rate": 7.067013591516713e-06, + "loss": 2.6401, + "step": 8286500 + }, + { + "epoch": 2.576134616789484, + "grad_norm": 10.35060977935791, + "learning_rate": 7.064423053508599e-06, + "loss": 2.5664, + "step": 8287000 + }, + { + "epoch": 2.576290049069971, + "grad_norm": 10.46481704711914, + "learning_rate": 7.061832515500484e-06, + "loss": 2.622, + "step": 8287500 + }, + { + "epoch": 2.576445481350458, + "grad_norm": 9.217860221862793, + "learning_rate": 7.05924197749237e-06, + "loss": 2.6125, + "step": 8288000 + }, + { + "epoch": 2.5766009136309447, + "grad_norm": 9.121726036071777, + "learning_rate": 7.056651439484255e-06, + "loss": 2.5777, + "step": 8288500 + }, + { + "epoch": 2.5767563459114315, + "grad_norm": 14.298592567443848, + "learning_rate": 7.05406090147614e-06, + "loss": 2.6254, + "step": 8289000 + }, + { + "epoch": 2.5769117781919184, + "grad_norm": 9.395142555236816, + "learning_rate": 7.051470363468025e-06, + "loss": 2.6336, + "step": 8289500 + }, + { + "epoch": 2.5770672104724053, + "grad_norm": 9.941899299621582, + "learning_rate": 7.048879825459911e-06, + "loss": 2.6024, + "step": 8290000 + }, + { + "epoch": 2.577222642752892, + "grad_norm": 9.289064407348633, + "learning_rate": 7.046289287451798e-06, + "loss": 2.6106, + "step": 8290500 + }, + { + "epoch": 2.577378075033379, + "grad_norm": 11.769881248474121, + "learning_rate": 7.043698749443682e-06, + "loss": 2.5682, + "step": 8291000 + }, + { + "epoch": 2.577533507313866, + "grad_norm": 14.557901382446289, + "learning_rate": 7.041108211435568e-06, + "loss": 2.5843, + "step": 8291500 + }, + { + "epoch": 2.5776889395943527, + "grad_norm": 11.437212944030762, + "learning_rate": 7.038517673427452e-06, + "loss": 2.5889, + "step": 8292000 + }, + { + "epoch": 2.5778443718748396, + "grad_norm": 10.026397705078125, + "learning_rate": 7.035927135419339e-06, + "loss": 2.5838, + "step": 8292500 + }, + { + "epoch": 2.5779998041553265, + "grad_norm": 9.948660850524902, + "learning_rate": 7.033336597411223e-06, + "loss": 2.5447, + "step": 8293000 + }, + { + "epoch": 2.5781552364358133, + "grad_norm": 10.054330825805664, + "learning_rate": 7.0307460594031095e-06, + "loss": 2.6002, + "step": 8293500 + }, + { + "epoch": 2.5783106687163, + "grad_norm": 9.093240737915039, + "learning_rate": 7.028155521394994e-06, + "loss": 2.5981, + "step": 8294000 + }, + { + "epoch": 2.578466100996787, + "grad_norm": 9.032247543334961, + "learning_rate": 7.02556498338688e-06, + "loss": 2.5889, + "step": 8294500 + }, + { + "epoch": 2.578621533277274, + "grad_norm": 8.520862579345703, + "learning_rate": 7.022974445378766e-06, + "loss": 2.5826, + "step": 8295000 + }, + { + "epoch": 2.578776965557761, + "grad_norm": 10.087979316711426, + "learning_rate": 7.0203839073706505e-06, + "loss": 2.5769, + "step": 8295500 + }, + { + "epoch": 2.5789323978382477, + "grad_norm": 9.179656028747559, + "learning_rate": 7.017793369362537e-06, + "loss": 2.5533, + "step": 8296000 + }, + { + "epoch": 2.579087830118735, + "grad_norm": 16.345787048339844, + "learning_rate": 7.0152028313544214e-06, + "loss": 2.6124, + "step": 8296500 + }, + { + "epoch": 2.5792432623992214, + "grad_norm": 9.872117042541504, + "learning_rate": 7.012612293346308e-06, + "loss": 2.587, + "step": 8297000 + }, + { + "epoch": 2.5793986946797087, + "grad_norm": 10.422450065612793, + "learning_rate": 7.010021755338192e-06, + "loss": 2.5846, + "step": 8297500 + }, + { + "epoch": 2.579554126960195, + "grad_norm": 9.92330551147461, + "learning_rate": 7.007431217330078e-06, + "loss": 2.5668, + "step": 8298000 + }, + { + "epoch": 2.5797095592406825, + "grad_norm": 12.090543746948242, + "learning_rate": 7.004840679321964e-06, + "loss": 2.6043, + "step": 8298500 + }, + { + "epoch": 2.579864991521169, + "grad_norm": 8.73303508758545, + "learning_rate": 7.002250141313849e-06, + "loss": 2.6091, + "step": 8299000 + }, + { + "epoch": 2.580020423801656, + "grad_norm": 10.007851600646973, + "learning_rate": 6.999659603305735e-06, + "loss": 2.599, + "step": 8299500 + }, + { + "epoch": 2.5801758560821426, + "grad_norm": 8.832229614257812, + "learning_rate": 6.99706906529762e-06, + "loss": 2.5843, + "step": 8300000 + }, + { + "epoch": 2.58033128836263, + "grad_norm": 9.343947410583496, + "learning_rate": 6.994478527289505e-06, + "loss": 2.5745, + "step": 8300500 + }, + { + "epoch": 2.5804867206431163, + "grad_norm": 27.554018020629883, + "learning_rate": 6.99188798928139e-06, + "loss": 2.5919, + "step": 8301000 + }, + { + "epoch": 2.5806421529236037, + "grad_norm": 11.930425643920898, + "learning_rate": 6.989297451273276e-06, + "loss": 2.5932, + "step": 8301500 + }, + { + "epoch": 2.58079758520409, + "grad_norm": 9.62363338470459, + "learning_rate": 6.986706913265161e-06, + "loss": 2.6227, + "step": 8302000 + }, + { + "epoch": 2.5809530174845774, + "grad_norm": 9.418720245361328, + "learning_rate": 6.984116375257047e-06, + "loss": 2.5822, + "step": 8302500 + }, + { + "epoch": 2.5811084497650643, + "grad_norm": 16.255455017089844, + "learning_rate": 6.981525837248932e-06, + "loss": 2.5916, + "step": 8303000 + }, + { + "epoch": 2.581263882045551, + "grad_norm": 10.489643096923828, + "learning_rate": 6.978935299240817e-06, + "loss": 2.5487, + "step": 8303500 + }, + { + "epoch": 2.581419314326038, + "grad_norm": 9.844016075134277, + "learning_rate": 6.976344761232703e-06, + "loss": 2.6109, + "step": 8304000 + }, + { + "epoch": 2.581574746606525, + "grad_norm": 16.288799285888672, + "learning_rate": 6.973754223224588e-06, + "loss": 2.5588, + "step": 8304500 + }, + { + "epoch": 2.5817301788870117, + "grad_norm": 6.2057719230651855, + "learning_rate": 6.971163685216474e-06, + "loss": 2.6177, + "step": 8305000 + }, + { + "epoch": 2.5818856111674986, + "grad_norm": 9.82999038696289, + "learning_rate": 6.968573147208359e-06, + "loss": 2.6045, + "step": 8305500 + }, + { + "epoch": 2.5820410434479855, + "grad_norm": 9.672236442565918, + "learning_rate": 6.965982609200244e-06, + "loss": 2.6125, + "step": 8306000 + }, + { + "epoch": 2.5821964757284723, + "grad_norm": 9.396002769470215, + "learning_rate": 6.963392071192129e-06, + "loss": 2.5952, + "step": 8306500 + }, + { + "epoch": 2.582351908008959, + "grad_norm": 7.3333845138549805, + "learning_rate": 6.960801533184015e-06, + "loss": 2.5991, + "step": 8307000 + }, + { + "epoch": 2.582507340289446, + "grad_norm": 54.44233322143555, + "learning_rate": 6.9582109951759015e-06, + "loss": 2.6316, + "step": 8307500 + }, + { + "epoch": 2.582662772569933, + "grad_norm": 9.087851524353027, + "learning_rate": 6.955620457167786e-06, + "loss": 2.6529, + "step": 8308000 + }, + { + "epoch": 2.58281820485042, + "grad_norm": 10.7346830368042, + "learning_rate": 6.953029919159672e-06, + "loss": 2.5922, + "step": 8308500 + }, + { + "epoch": 2.5829736371309067, + "grad_norm": 20.5643310546875, + "learning_rate": 6.950439381151556e-06, + "loss": 2.6287, + "step": 8309000 + }, + { + "epoch": 2.5831290694113935, + "grad_norm": 9.707176208496094, + "learning_rate": 6.9478488431434425e-06, + "loss": 2.5926, + "step": 8309500 + }, + { + "epoch": 2.5832845016918804, + "grad_norm": 11.646638870239258, + "learning_rate": 6.945258305135327e-06, + "loss": 2.5466, + "step": 8310000 + }, + { + "epoch": 2.5834399339723673, + "grad_norm": 11.626461029052734, + "learning_rate": 6.9426677671272134e-06, + "loss": 2.6241, + "step": 8310500 + }, + { + "epoch": 2.583595366252854, + "grad_norm": 14.87472915649414, + "learning_rate": 6.940077229119098e-06, + "loss": 2.6224, + "step": 8311000 + }, + { + "epoch": 2.583750798533341, + "grad_norm": 12.703420639038086, + "learning_rate": 6.9374866911109835e-06, + "loss": 2.5845, + "step": 8311500 + }, + { + "epoch": 2.583906230813828, + "grad_norm": 17.051435470581055, + "learning_rate": 6.93489615310287e-06, + "loss": 2.6347, + "step": 8312000 + }, + { + "epoch": 2.5840616630943147, + "grad_norm": 11.45814323425293, + "learning_rate": 6.9323056150947544e-06, + "loss": 2.6392, + "step": 8312500 + }, + { + "epoch": 2.5842170953748016, + "grad_norm": 47.17498779296875, + "learning_rate": 6.929715077086641e-06, + "loss": 2.5705, + "step": 8313000 + }, + { + "epoch": 2.5843725276552885, + "grad_norm": 17.800701141357422, + "learning_rate": 6.927124539078525e-06, + "loss": 2.622, + "step": 8313500 + }, + { + "epoch": 2.5845279599357753, + "grad_norm": 17.338581085205078, + "learning_rate": 6.924534001070411e-06, + "loss": 2.5896, + "step": 8314000 + }, + { + "epoch": 2.584683392216262, + "grad_norm": 9.754693984985352, + "learning_rate": 6.9219434630622954e-06, + "loss": 2.5788, + "step": 8314500 + }, + { + "epoch": 2.584838824496749, + "grad_norm": 8.706246376037598, + "learning_rate": 6.919352925054182e-06, + "loss": 2.5864, + "step": 8315000 + }, + { + "epoch": 2.584994256777236, + "grad_norm": 9.49280834197998, + "learning_rate": 6.916762387046066e-06, + "loss": 2.6305, + "step": 8315500 + }, + { + "epoch": 2.585149689057723, + "grad_norm": 8.880138397216797, + "learning_rate": 6.914171849037953e-06, + "loss": 2.5488, + "step": 8316000 + }, + { + "epoch": 2.5853051213382097, + "grad_norm": 10.97614860534668, + "learning_rate": 6.911581311029838e-06, + "loss": 2.61, + "step": 8316500 + }, + { + "epoch": 2.5854605536186965, + "grad_norm": 8.639646530151367, + "learning_rate": 6.908990773021723e-06, + "loss": 2.5976, + "step": 8317000 + }, + { + "epoch": 2.5856159858991834, + "grad_norm": 11.299515724182129, + "learning_rate": 6.906400235013609e-06, + "loss": 2.5932, + "step": 8317500 + }, + { + "epoch": 2.5857714181796703, + "grad_norm": 8.987006187438965, + "learning_rate": 6.903809697005494e-06, + "loss": 2.597, + "step": 8318000 + }, + { + "epoch": 2.585926850460157, + "grad_norm": 10.37508487701416, + "learning_rate": 6.90121915899738e-06, + "loss": 2.6178, + "step": 8318500 + }, + { + "epoch": 2.586082282740644, + "grad_norm": 25.23549461364746, + "learning_rate": 6.8986286209892646e-06, + "loss": 2.6466, + "step": 8319000 + }, + { + "epoch": 2.586237715021131, + "grad_norm": 9.512983322143555, + "learning_rate": 6.89603808298115e-06, + "loss": 2.5845, + "step": 8319500 + }, + { + "epoch": 2.5863931473016177, + "grad_norm": 9.27951717376709, + "learning_rate": 6.893447544973035e-06, + "loss": 2.5551, + "step": 8320000 + }, + { + "epoch": 2.5865485795821046, + "grad_norm": 17.66098403930664, + "learning_rate": 6.890857006964921e-06, + "loss": 2.6007, + "step": 8320500 + }, + { + "epoch": 2.586704011862592, + "grad_norm": 9.058838844299316, + "learning_rate": 6.888266468956807e-06, + "loss": 2.618, + "step": 8321000 + }, + { + "epoch": 2.5868594441430783, + "grad_norm": 90.53553009033203, + "learning_rate": 6.885675930948692e-06, + "loss": 2.5811, + "step": 8321500 + }, + { + "epoch": 2.5870148764235656, + "grad_norm": 11.170099258422852, + "learning_rate": 6.883085392940577e-06, + "loss": 2.5583, + "step": 8322000 + }, + { + "epoch": 2.587170308704052, + "grad_norm": 10.141180038452148, + "learning_rate": 6.880494854932462e-06, + "loss": 2.5671, + "step": 8322500 + }, + { + "epoch": 2.5873257409845394, + "grad_norm": 9.511003494262695, + "learning_rate": 6.877904316924348e-06, + "loss": 2.5945, + "step": 8323000 + }, + { + "epoch": 2.587481173265026, + "grad_norm": 9.910348892211914, + "learning_rate": 6.875313778916233e-06, + "loss": 2.5772, + "step": 8323500 + }, + { + "epoch": 2.587636605545513, + "grad_norm": 9.218727111816406, + "learning_rate": 6.872723240908119e-06, + "loss": 2.5954, + "step": 8324000 + }, + { + "epoch": 2.5877920378259995, + "grad_norm": 9.558538436889648, + "learning_rate": 6.870132702900004e-06, + "loss": 2.5776, + "step": 8324500 + }, + { + "epoch": 2.587947470106487, + "grad_norm": 11.266879081726074, + "learning_rate": 6.867542164891889e-06, + "loss": 2.5984, + "step": 8325000 + }, + { + "epoch": 2.5881029023869733, + "grad_norm": 10.557374000549316, + "learning_rate": 6.8649516268837755e-06, + "loss": 2.5753, + "step": 8325500 + }, + { + "epoch": 2.5882583346674606, + "grad_norm": 9.529340744018555, + "learning_rate": 6.86236108887566e-06, + "loss": 2.5483, + "step": 8326000 + }, + { + "epoch": 2.5884137669479474, + "grad_norm": 13.944048881530762, + "learning_rate": 6.8597705508675464e-06, + "loss": 2.5853, + "step": 8326500 + }, + { + "epoch": 2.5885691992284343, + "grad_norm": 9.048884391784668, + "learning_rate": 6.857180012859431e-06, + "loss": 2.537, + "step": 8327000 + }, + { + "epoch": 2.588724631508921, + "grad_norm": 15.275006294250488, + "learning_rate": 6.8545894748513165e-06, + "loss": 2.6546, + "step": 8327500 + }, + { + "epoch": 2.588880063789408, + "grad_norm": 9.759896278381348, + "learning_rate": 6.851998936843201e-06, + "loss": 2.5685, + "step": 8328000 + }, + { + "epoch": 2.589035496069895, + "grad_norm": 10.632315635681152, + "learning_rate": 6.8494083988350874e-06, + "loss": 2.6052, + "step": 8328500 + }, + { + "epoch": 2.589190928350382, + "grad_norm": 9.3245210647583, + "learning_rate": 6.846817860826972e-06, + "loss": 2.581, + "step": 8329000 + }, + { + "epoch": 2.5893463606308686, + "grad_norm": 11.946053504943848, + "learning_rate": 6.844227322818858e-06, + "loss": 2.5996, + "step": 8329500 + }, + { + "epoch": 2.5895017929113555, + "grad_norm": 10.663887977600098, + "learning_rate": 6.841636784810744e-06, + "loss": 2.6043, + "step": 8330000 + }, + { + "epoch": 2.5896572251918424, + "grad_norm": 9.43399429321289, + "learning_rate": 6.8390462468026284e-06, + "loss": 2.5519, + "step": 8330500 + }, + { + "epoch": 2.5898126574723292, + "grad_norm": 9.750828742980957, + "learning_rate": 6.836455708794515e-06, + "loss": 2.5986, + "step": 8331000 + }, + { + "epoch": 2.589968089752816, + "grad_norm": 10.485732078552246, + "learning_rate": 6.833865170786399e-06, + "loss": 2.5538, + "step": 8331500 + }, + { + "epoch": 2.590123522033303, + "grad_norm": 10.806584358215332, + "learning_rate": 6.831274632778286e-06, + "loss": 2.6225, + "step": 8332000 + }, + { + "epoch": 2.59027895431379, + "grad_norm": 19.89501953125, + "learning_rate": 6.82868409477017e-06, + "loss": 2.6095, + "step": 8332500 + }, + { + "epoch": 2.5904343865942767, + "grad_norm": 13.430764198303223, + "learning_rate": 6.826093556762056e-06, + "loss": 2.5924, + "step": 8333000 + }, + { + "epoch": 2.5905898188747636, + "grad_norm": 10.509170532226562, + "learning_rate": 6.82350301875394e-06, + "loss": 2.5976, + "step": 8333500 + }, + { + "epoch": 2.5907452511552505, + "grad_norm": 11.267539024353027, + "learning_rate": 6.820912480745827e-06, + "loss": 2.6253, + "step": 8334000 + }, + { + "epoch": 2.5909006834357373, + "grad_norm": 11.314038276672363, + "learning_rate": 6.818321942737713e-06, + "loss": 2.5597, + "step": 8334500 + }, + { + "epoch": 2.591056115716224, + "grad_norm": 18.207826614379883, + "learning_rate": 6.8157314047295976e-06, + "loss": 2.5788, + "step": 8335000 + }, + { + "epoch": 2.591211547996711, + "grad_norm": 10.162575721740723, + "learning_rate": 6.813140866721483e-06, + "loss": 2.5787, + "step": 8335500 + }, + { + "epoch": 2.591366980277198, + "grad_norm": 10.942526817321777, + "learning_rate": 6.810550328713368e-06, + "loss": 2.6285, + "step": 8336000 + }, + { + "epoch": 2.591522412557685, + "grad_norm": 7.10249137878418, + "learning_rate": 6.807959790705254e-06, + "loss": 2.5505, + "step": 8336500 + }, + { + "epoch": 2.5916778448381717, + "grad_norm": 8.969525337219238, + "learning_rate": 6.8053692526971386e-06, + "loss": 2.5935, + "step": 8337000 + }, + { + "epoch": 2.5918332771186585, + "grad_norm": 18.580455780029297, + "learning_rate": 6.802778714689025e-06, + "loss": 2.592, + "step": 8337500 + }, + { + "epoch": 2.5919887093991454, + "grad_norm": 12.846304893493652, + "learning_rate": 6.8001881766809095e-06, + "loss": 2.5627, + "step": 8338000 + }, + { + "epoch": 2.5921441416796323, + "grad_norm": 8.942526817321777, + "learning_rate": 6.797597638672795e-06, + "loss": 2.6281, + "step": 8338500 + }, + { + "epoch": 2.592299573960119, + "grad_norm": 9.430195808410645, + "learning_rate": 6.795007100664681e-06, + "loss": 2.5601, + "step": 8339000 + }, + { + "epoch": 2.592455006240606, + "grad_norm": 11.395645141601562, + "learning_rate": 6.792416562656566e-06, + "loss": 2.5661, + "step": 8339500 + }, + { + "epoch": 2.592610438521093, + "grad_norm": 13.799983978271484, + "learning_rate": 6.789826024648452e-06, + "loss": 2.6653, + "step": 8340000 + }, + { + "epoch": 2.5927658708015797, + "grad_norm": 10.882475852966309, + "learning_rate": 6.787235486640337e-06, + "loss": 2.5956, + "step": 8340500 + }, + { + "epoch": 2.5929213030820666, + "grad_norm": 12.094482421875, + "learning_rate": 6.784644948632222e-06, + "loss": 2.5662, + "step": 8341000 + }, + { + "epoch": 2.5930767353625535, + "grad_norm": 9.293008804321289, + "learning_rate": 6.782054410624107e-06, + "loss": 2.57, + "step": 8341500 + }, + { + "epoch": 2.5932321676430403, + "grad_norm": 10.056183815002441, + "learning_rate": 6.779463872615993e-06, + "loss": 2.6073, + "step": 8342000 + }, + { + "epoch": 2.593387599923527, + "grad_norm": 11.357943534851074, + "learning_rate": 6.776873334607878e-06, + "loss": 2.537, + "step": 8342500 + }, + { + "epoch": 2.593543032204014, + "grad_norm": 10.707761764526367, + "learning_rate": 6.774282796599764e-06, + "loss": 2.5979, + "step": 8343000 + }, + { + "epoch": 2.593698464484501, + "grad_norm": 9.237019538879395, + "learning_rate": 6.7716922585916495e-06, + "loss": 2.6134, + "step": 8343500 + }, + { + "epoch": 2.593853896764988, + "grad_norm": 15.089059829711914, + "learning_rate": 6.769101720583534e-06, + "loss": 2.617, + "step": 8344000 + }, + { + "epoch": 2.594009329045475, + "grad_norm": 10.547201156616211, + "learning_rate": 6.7665111825754204e-06, + "loss": 2.5965, + "step": 8344500 + }, + { + "epoch": 2.5941647613259615, + "grad_norm": 46.69375991821289, + "learning_rate": 6.763920644567305e-06, + "loss": 2.6155, + "step": 8345000 + }, + { + "epoch": 2.594320193606449, + "grad_norm": 13.60830307006836, + "learning_rate": 6.761330106559191e-06, + "loss": 2.6058, + "step": 8345500 + }, + { + "epoch": 2.5944756258869353, + "grad_norm": 11.795572280883789, + "learning_rate": 6.758739568551076e-06, + "loss": 2.6175, + "step": 8346000 + }, + { + "epoch": 2.5946310581674226, + "grad_norm": 22.83013343811035, + "learning_rate": 6.7561490305429614e-06, + "loss": 2.595, + "step": 8346500 + }, + { + "epoch": 2.594786490447909, + "grad_norm": 9.366497993469238, + "learning_rate": 6.753558492534846e-06, + "loss": 2.5922, + "step": 8347000 + }, + { + "epoch": 2.5949419227283963, + "grad_norm": 12.137068748474121, + "learning_rate": 6.750967954526732e-06, + "loss": 2.5842, + "step": 8347500 + }, + { + "epoch": 2.5950973550088827, + "grad_norm": 14.747393608093262, + "learning_rate": 6.748377416518619e-06, + "loss": 2.6169, + "step": 8348000 + }, + { + "epoch": 2.59525278728937, + "grad_norm": 11.142578125, + "learning_rate": 6.745786878510503e-06, + "loss": 2.6046, + "step": 8348500 + }, + { + "epoch": 2.5954082195698565, + "grad_norm": 10.787734031677246, + "learning_rate": 6.743196340502389e-06, + "loss": 2.5721, + "step": 8349000 + }, + { + "epoch": 2.5955636518503438, + "grad_norm": 8.767266273498535, + "learning_rate": 6.740605802494273e-06, + "loss": 2.6687, + "step": 8349500 + }, + { + "epoch": 2.59571908413083, + "grad_norm": 11.858742713928223, + "learning_rate": 6.73801526448616e-06, + "loss": 2.6149, + "step": 8350000 + }, + { + "epoch": 2.5958745164113175, + "grad_norm": 8.325481414794922, + "learning_rate": 6.735424726478044e-06, + "loss": 2.605, + "step": 8350500 + }, + { + "epoch": 2.5960299486918044, + "grad_norm": 13.410625457763672, + "learning_rate": 6.7328341884699306e-06, + "loss": 2.616, + "step": 8351000 + }, + { + "epoch": 2.5961853809722912, + "grad_norm": 13.668091773986816, + "learning_rate": 6.730243650461815e-06, + "loss": 2.6166, + "step": 8351500 + }, + { + "epoch": 2.596340813252778, + "grad_norm": 10.062768936157227, + "learning_rate": 6.727653112453701e-06, + "loss": 2.5942, + "step": 8352000 + }, + { + "epoch": 2.596496245533265, + "grad_norm": 11.082050323486328, + "learning_rate": 6.725062574445587e-06, + "loss": 2.6079, + "step": 8352500 + }, + { + "epoch": 2.596651677813752, + "grad_norm": 10.148900032043457, + "learning_rate": 6.7224720364374716e-06, + "loss": 2.608, + "step": 8353000 + }, + { + "epoch": 2.5968071100942387, + "grad_norm": 12.704815864562988, + "learning_rate": 6.719881498429358e-06, + "loss": 2.5424, + "step": 8353500 + }, + { + "epoch": 2.5969625423747256, + "grad_norm": 13.790635108947754, + "learning_rate": 6.7172909604212425e-06, + "loss": 2.5561, + "step": 8354000 + }, + { + "epoch": 2.5971179746552124, + "grad_norm": 10.955669403076172, + "learning_rate": 6.714700422413128e-06, + "loss": 2.5509, + "step": 8354500 + }, + { + "epoch": 2.5972734069356993, + "grad_norm": 10.361227989196777, + "learning_rate": 6.7121098844050125e-06, + "loss": 2.6049, + "step": 8355000 + }, + { + "epoch": 2.597428839216186, + "grad_norm": 8.200838088989258, + "learning_rate": 6.709519346396899e-06, + "loss": 2.6069, + "step": 8355500 + }, + { + "epoch": 2.597584271496673, + "grad_norm": 10.339225769042969, + "learning_rate": 6.7069288083887835e-06, + "loss": 2.5877, + "step": 8356000 + }, + { + "epoch": 2.59773970377716, + "grad_norm": 12.039509773254395, + "learning_rate": 6.70433827038067e-06, + "loss": 2.5732, + "step": 8356500 + }, + { + "epoch": 2.5978951360576468, + "grad_norm": 9.196782112121582, + "learning_rate": 6.701747732372555e-06, + "loss": 2.5982, + "step": 8357000 + }, + { + "epoch": 2.5980505683381336, + "grad_norm": 9.194636344909668, + "learning_rate": 6.69915719436444e-06, + "loss": 2.5833, + "step": 8357500 + }, + { + "epoch": 2.5982060006186205, + "grad_norm": 11.188305854797363, + "learning_rate": 6.696566656356326e-06, + "loss": 2.6027, + "step": 8358000 + }, + { + "epoch": 2.5983614328991074, + "grad_norm": 9.399676322937012, + "learning_rate": 6.693976118348211e-06, + "loss": 2.6107, + "step": 8358500 + }, + { + "epoch": 2.5985168651795942, + "grad_norm": 19.11429214477539, + "learning_rate": 6.691385580340097e-06, + "loss": 2.5866, + "step": 8359000 + }, + { + "epoch": 2.598672297460081, + "grad_norm": 13.55917739868164, + "learning_rate": 6.688795042331982e-06, + "loss": 2.5918, + "step": 8359500 + }, + { + "epoch": 2.598827729740568, + "grad_norm": 8.853660583496094, + "learning_rate": 6.686204504323867e-06, + "loss": 2.6091, + "step": 8360000 + }, + { + "epoch": 2.598983162021055, + "grad_norm": 11.86072063446045, + "learning_rate": 6.683613966315752e-06, + "loss": 2.6115, + "step": 8360500 + }, + { + "epoch": 2.5991385943015417, + "grad_norm": 17.3917179107666, + "learning_rate": 6.681023428307638e-06, + "loss": 2.6228, + "step": 8361000 + }, + { + "epoch": 2.5992940265820286, + "grad_norm": 9.676480293273926, + "learning_rate": 6.678432890299524e-06, + "loss": 2.5353, + "step": 8361500 + }, + { + "epoch": 2.5994494588625154, + "grad_norm": 10.49279499053955, + "learning_rate": 6.675842352291409e-06, + "loss": 2.5524, + "step": 8362000 + }, + { + "epoch": 2.5996048911430023, + "grad_norm": 8.536209106445312, + "learning_rate": 6.6732518142832944e-06, + "loss": 2.5841, + "step": 8362500 + }, + { + "epoch": 2.599760323423489, + "grad_norm": 8.624156951904297, + "learning_rate": 6.670661276275179e-06, + "loss": 2.6229, + "step": 8363000 + }, + { + "epoch": 2.599915755703976, + "grad_norm": 9.312881469726562, + "learning_rate": 6.668070738267065e-06, + "loss": 2.5956, + "step": 8363500 + }, + { + "epoch": 2.600071187984463, + "grad_norm": 9.953371047973633, + "learning_rate": 6.66548020025895e-06, + "loss": 2.601, + "step": 8364000 + }, + { + "epoch": 2.6002266202649498, + "grad_norm": 50.324317932128906, + "learning_rate": 6.662889662250836e-06, + "loss": 2.5721, + "step": 8364500 + }, + { + "epoch": 2.6003820525454366, + "grad_norm": 12.241674423217773, + "learning_rate": 6.660299124242721e-06, + "loss": 2.6541, + "step": 8365000 + }, + { + "epoch": 2.6005374848259235, + "grad_norm": 7.690042972564697, + "learning_rate": 6.657708586234606e-06, + "loss": 2.5818, + "step": 8365500 + }, + { + "epoch": 2.6006929171064104, + "grad_norm": 11.61663818359375, + "learning_rate": 6.655118048226493e-06, + "loss": 2.6149, + "step": 8366000 + }, + { + "epoch": 2.6008483493868972, + "grad_norm": 10.246222496032715, + "learning_rate": 6.652527510218377e-06, + "loss": 2.524, + "step": 8366500 + }, + { + "epoch": 2.601003781667384, + "grad_norm": 9.640286445617676, + "learning_rate": 6.6499369722102636e-06, + "loss": 2.5838, + "step": 8367000 + }, + { + "epoch": 2.601159213947871, + "grad_norm": 10.659416198730469, + "learning_rate": 6.647346434202148e-06, + "loss": 2.5728, + "step": 8367500 + }, + { + "epoch": 2.601314646228358, + "grad_norm": 6.140579700469971, + "learning_rate": 6.644755896194034e-06, + "loss": 2.619, + "step": 8368000 + }, + { + "epoch": 2.6014700785088447, + "grad_norm": 10.569930076599121, + "learning_rate": 6.642165358185918e-06, + "loss": 2.5975, + "step": 8368500 + }, + { + "epoch": 2.601625510789332, + "grad_norm": 9.757020950317383, + "learning_rate": 6.6395748201778046e-06, + "loss": 2.6246, + "step": 8369000 + }, + { + "epoch": 2.6017809430698184, + "grad_norm": 11.591867446899414, + "learning_rate": 6.636984282169689e-06, + "loss": 2.5882, + "step": 8369500 + }, + { + "epoch": 2.6019363753503058, + "grad_norm": 9.355794906616211, + "learning_rate": 6.6343937441615755e-06, + "loss": 2.5927, + "step": 8370000 + }, + { + "epoch": 2.602091807630792, + "grad_norm": 10.923280715942383, + "learning_rate": 6.631803206153461e-06, + "loss": 2.6049, + "step": 8370500 + }, + { + "epoch": 2.6022472399112795, + "grad_norm": 10.697514533996582, + "learning_rate": 6.6292126681453456e-06, + "loss": 2.6211, + "step": 8371000 + }, + { + "epoch": 2.602402672191766, + "grad_norm": 10.127492904663086, + "learning_rate": 6.626622130137232e-06, + "loss": 2.612, + "step": 8371500 + }, + { + "epoch": 2.602558104472253, + "grad_norm": 8.74005126953125, + "learning_rate": 6.6240315921291165e-06, + "loss": 2.5375, + "step": 8372000 + }, + { + "epoch": 2.6027135367527396, + "grad_norm": 10.864766120910645, + "learning_rate": 6.621441054121003e-06, + "loss": 2.5834, + "step": 8372500 + }, + { + "epoch": 2.602868969033227, + "grad_norm": 9.929703712463379, + "learning_rate": 6.618850516112887e-06, + "loss": 2.5861, + "step": 8373000 + }, + { + "epoch": 2.6030244013137134, + "grad_norm": 11.124922752380371, + "learning_rate": 6.616259978104773e-06, + "loss": 2.5926, + "step": 8373500 + }, + { + "epoch": 2.6031798335942007, + "grad_norm": 10.467906951904297, + "learning_rate": 6.6136694400966575e-06, + "loss": 2.5932, + "step": 8374000 + }, + { + "epoch": 2.603335265874687, + "grad_norm": 27.644073486328125, + "learning_rate": 6.611078902088544e-06, + "loss": 2.5621, + "step": 8374500 + }, + { + "epoch": 2.6034906981551744, + "grad_norm": 10.10387134552002, + "learning_rate": 6.60848836408043e-06, + "loss": 2.5964, + "step": 8375000 + }, + { + "epoch": 2.6036461304356613, + "grad_norm": 10.539552688598633, + "learning_rate": 6.605897826072315e-06, + "loss": 2.6041, + "step": 8375500 + }, + { + "epoch": 2.603801562716148, + "grad_norm": 11.200972557067871, + "learning_rate": 6.6033072880642e-06, + "loss": 2.5622, + "step": 8376000 + }, + { + "epoch": 2.603956994996635, + "grad_norm": 8.863504409790039, + "learning_rate": 6.600716750056085e-06, + "loss": 2.6008, + "step": 8376500 + }, + { + "epoch": 2.604112427277122, + "grad_norm": 10.339340209960938, + "learning_rate": 6.598126212047971e-06, + "loss": 2.6184, + "step": 8377000 + }, + { + "epoch": 2.6042678595576088, + "grad_norm": 13.617816925048828, + "learning_rate": 6.595535674039856e-06, + "loss": 2.6123, + "step": 8377500 + }, + { + "epoch": 2.6044232918380956, + "grad_norm": 9.908581733703613, + "learning_rate": 6.592945136031742e-06, + "loss": 2.5785, + "step": 8378000 + }, + { + "epoch": 2.6045787241185825, + "grad_norm": 10.208206176757812, + "learning_rate": 6.590354598023627e-06, + "loss": 2.6163, + "step": 8378500 + }, + { + "epoch": 2.6047341563990694, + "grad_norm": 11.449905395507812, + "learning_rate": 6.587764060015512e-06, + "loss": 2.5699, + "step": 8379000 + }, + { + "epoch": 2.6048895886795562, + "grad_norm": 12.248048782348633, + "learning_rate": 6.585173522007398e-06, + "loss": 2.6359, + "step": 8379500 + }, + { + "epoch": 2.605045020960043, + "grad_norm": 10.568693161010742, + "learning_rate": 6.582582983999283e-06, + "loss": 2.579, + "step": 8380000 + }, + { + "epoch": 2.60520045324053, + "grad_norm": 11.789876937866211, + "learning_rate": 6.579992445991169e-06, + "loss": 2.5819, + "step": 8380500 + }, + { + "epoch": 2.605355885521017, + "grad_norm": 7.004114151000977, + "learning_rate": 6.577401907983054e-06, + "loss": 2.542, + "step": 8381000 + }, + { + "epoch": 2.6055113178015037, + "grad_norm": 13.963423728942871, + "learning_rate": 6.574811369974939e-06, + "loss": 2.5844, + "step": 8381500 + }, + { + "epoch": 2.6056667500819906, + "grad_norm": 9.108633041381836, + "learning_rate": 6.572220831966824e-06, + "loss": 2.6107, + "step": 8382000 + }, + { + "epoch": 2.6058221823624774, + "grad_norm": 10.53232192993164, + "learning_rate": 6.56963029395871e-06, + "loss": 2.584, + "step": 8382500 + }, + { + "epoch": 2.6059776146429643, + "grad_norm": 12.91695785522461, + "learning_rate": 6.567039755950595e-06, + "loss": 2.6303, + "step": 8383000 + }, + { + "epoch": 2.606133046923451, + "grad_norm": 13.564339637756348, + "learning_rate": 6.564449217942481e-06, + "loss": 2.5953, + "step": 8383500 + }, + { + "epoch": 2.606288479203938, + "grad_norm": 21.767684936523438, + "learning_rate": 6.561858679934367e-06, + "loss": 2.5858, + "step": 8384000 + }, + { + "epoch": 2.606443911484425, + "grad_norm": 10.736268043518066, + "learning_rate": 6.559268141926251e-06, + "loss": 2.6102, + "step": 8384500 + }, + { + "epoch": 2.6065993437649118, + "grad_norm": 9.60566234588623, + "learning_rate": 6.5566776039181376e-06, + "loss": 2.6014, + "step": 8385000 + }, + { + "epoch": 2.6067547760453986, + "grad_norm": 13.146986961364746, + "learning_rate": 6.554087065910022e-06, + "loss": 2.5857, + "step": 8385500 + }, + { + "epoch": 2.6069102083258855, + "grad_norm": 9.540262222290039, + "learning_rate": 6.5514965279019085e-06, + "loss": 2.6067, + "step": 8386000 + }, + { + "epoch": 2.6070656406063724, + "grad_norm": 10.08193302154541, + "learning_rate": 6.548905989893793e-06, + "loss": 2.5787, + "step": 8386500 + }, + { + "epoch": 2.6072210728868592, + "grad_norm": 7.749365329742432, + "learning_rate": 6.5463154518856786e-06, + "loss": 2.5527, + "step": 8387000 + }, + { + "epoch": 2.607376505167346, + "grad_norm": 13.66150951385498, + "learning_rate": 6.543724913877563e-06, + "loss": 2.572, + "step": 8387500 + }, + { + "epoch": 2.607531937447833, + "grad_norm": 10.577105522155762, + "learning_rate": 6.5411343758694495e-06, + "loss": 2.5639, + "step": 8388000 + }, + { + "epoch": 2.60768736972832, + "grad_norm": 11.114225387573242, + "learning_rate": 6.538543837861336e-06, + "loss": 2.5763, + "step": 8388500 + }, + { + "epoch": 2.6078428020088067, + "grad_norm": 9.497424125671387, + "learning_rate": 6.53595329985322e-06, + "loss": 2.5354, + "step": 8389000 + }, + { + "epoch": 2.6079982342892936, + "grad_norm": 12.207103729248047, + "learning_rate": 6.533362761845106e-06, + "loss": 2.6148, + "step": 8389500 + }, + { + "epoch": 2.6081536665697804, + "grad_norm": 11.513703346252441, + "learning_rate": 6.530772223836991e-06, + "loss": 2.6218, + "step": 8390000 + }, + { + "epoch": 2.6083090988502673, + "grad_norm": 10.8588285446167, + "learning_rate": 6.528181685828877e-06, + "loss": 2.5986, + "step": 8390500 + }, + { + "epoch": 2.608464531130754, + "grad_norm": 11.053077697753906, + "learning_rate": 6.525591147820761e-06, + "loss": 2.6133, + "step": 8391000 + }, + { + "epoch": 2.608619963411241, + "grad_norm": 10.52961254119873, + "learning_rate": 6.523000609812648e-06, + "loss": 2.6154, + "step": 8391500 + }, + { + "epoch": 2.608775395691728, + "grad_norm": 11.34431266784668, + "learning_rate": 6.520410071804532e-06, + "loss": 2.5579, + "step": 8392000 + }, + { + "epoch": 2.6089308279722148, + "grad_norm": 7.085823059082031, + "learning_rate": 6.517819533796419e-06, + "loss": 2.6193, + "step": 8392500 + }, + { + "epoch": 2.6090862602527016, + "grad_norm": 11.020320892333984, + "learning_rate": 6.515228995788304e-06, + "loss": 2.564, + "step": 8393000 + }, + { + "epoch": 2.609241692533189, + "grad_norm": 7.240298748016357, + "learning_rate": 6.512638457780189e-06, + "loss": 2.5804, + "step": 8393500 + }, + { + "epoch": 2.6093971248136754, + "grad_norm": 8.603004455566406, + "learning_rate": 6.510047919772075e-06, + "loss": 2.6007, + "step": 8394000 + }, + { + "epoch": 2.6095525570941627, + "grad_norm": 9.986251831054688, + "learning_rate": 6.50745738176396e-06, + "loss": 2.5548, + "step": 8394500 + }, + { + "epoch": 2.609707989374649, + "grad_norm": 12.084172248840332, + "learning_rate": 6.504866843755846e-06, + "loss": 2.6161, + "step": 8395000 + }, + { + "epoch": 2.6098634216551364, + "grad_norm": 9.474181175231934, + "learning_rate": 6.5022763057477305e-06, + "loss": 2.5949, + "step": 8395500 + }, + { + "epoch": 2.610018853935623, + "grad_norm": 14.272550582885742, + "learning_rate": 6.499685767739616e-06, + "loss": 2.5478, + "step": 8396000 + }, + { + "epoch": 2.61017428621611, + "grad_norm": 7.27834415435791, + "learning_rate": 6.497095229731501e-06, + "loss": 2.558, + "step": 8396500 + }, + { + "epoch": 2.6103297184965966, + "grad_norm": 8.378321647644043, + "learning_rate": 6.494504691723387e-06, + "loss": 2.5947, + "step": 8397000 + }, + { + "epoch": 2.610485150777084, + "grad_norm": 34.68902587890625, + "learning_rate": 6.491914153715273e-06, + "loss": 2.5952, + "step": 8397500 + }, + { + "epoch": 2.6106405830575703, + "grad_norm": 9.925490379333496, + "learning_rate": 6.489323615707158e-06, + "loss": 2.6047, + "step": 8398000 + }, + { + "epoch": 2.6107960153380576, + "grad_norm": 10.797547340393066, + "learning_rate": 6.486733077699043e-06, + "loss": 2.5888, + "step": 8398500 + }, + { + "epoch": 2.6109514476185445, + "grad_norm": 16.506765365600586, + "learning_rate": 6.484142539690928e-06, + "loss": 2.5882, + "step": 8399000 + }, + { + "epoch": 2.6111068798990313, + "grad_norm": 25.010286331176758, + "learning_rate": 6.481552001682814e-06, + "loss": 2.5906, + "step": 8399500 + }, + { + "epoch": 2.611262312179518, + "grad_norm": 8.15039348602295, + "learning_rate": 6.478961463674699e-06, + "loss": 2.6316, + "step": 8400000 + }, + { + "epoch": 2.611417744460005, + "grad_norm": 10.17445182800293, + "learning_rate": 6.476370925666585e-06, + "loss": 2.6165, + "step": 8400500 + }, + { + "epoch": 2.611573176740492, + "grad_norm": 13.30849838256836, + "learning_rate": 6.47378038765847e-06, + "loss": 2.5913, + "step": 8401000 + }, + { + "epoch": 2.611728609020979, + "grad_norm": 13.566370010375977, + "learning_rate": 6.471189849650355e-06, + "loss": 2.6399, + "step": 8401500 + }, + { + "epoch": 2.6118840413014657, + "grad_norm": 8.7732515335083, + "learning_rate": 6.4685993116422415e-06, + "loss": 2.5868, + "step": 8402000 + }, + { + "epoch": 2.6120394735819525, + "grad_norm": 10.264923095703125, + "learning_rate": 6.466008773634126e-06, + "loss": 2.5796, + "step": 8402500 + }, + { + "epoch": 2.6121949058624394, + "grad_norm": 9.429855346679688, + "learning_rate": 6.463418235626012e-06, + "loss": 2.6283, + "step": 8403000 + }, + { + "epoch": 2.6123503381429263, + "grad_norm": 13.164348602294922, + "learning_rate": 6.460827697617897e-06, + "loss": 2.6583, + "step": 8403500 + }, + { + "epoch": 2.612505770423413, + "grad_norm": 12.69997501373291, + "learning_rate": 6.4582371596097825e-06, + "loss": 2.617, + "step": 8404000 + }, + { + "epoch": 2.6126612027039, + "grad_norm": 11.457562446594238, + "learning_rate": 6.455646621601667e-06, + "loss": 2.5288, + "step": 8404500 + }, + { + "epoch": 2.612816634984387, + "grad_norm": 6.983985900878906, + "learning_rate": 6.453056083593553e-06, + "loss": 2.5813, + "step": 8405000 + }, + { + "epoch": 2.6129720672648737, + "grad_norm": 13.196722984313965, + "learning_rate": 6.450465545585438e-06, + "loss": 2.6253, + "step": 8405500 + }, + { + "epoch": 2.6131274995453606, + "grad_norm": 10.016190528869629, + "learning_rate": 6.447875007577324e-06, + "loss": 2.5735, + "step": 8406000 + }, + { + "epoch": 2.6132829318258475, + "grad_norm": 9.114789962768555, + "learning_rate": 6.44528446956921e-06, + "loss": 2.6009, + "step": 8406500 + }, + { + "epoch": 2.6134383641063343, + "grad_norm": 8.905508041381836, + "learning_rate": 6.442693931561094e-06, + "loss": 2.6071, + "step": 8407000 + }, + { + "epoch": 2.613593796386821, + "grad_norm": 15.075413703918457, + "learning_rate": 6.440103393552981e-06, + "loss": 2.5894, + "step": 8407500 + }, + { + "epoch": 2.613749228667308, + "grad_norm": 9.962820053100586, + "learning_rate": 6.437512855544865e-06, + "loss": 2.6124, + "step": 8408000 + }, + { + "epoch": 2.613904660947795, + "grad_norm": 14.222315788269043, + "learning_rate": 6.434922317536752e-06, + "loss": 2.6305, + "step": 8408500 + }, + { + "epoch": 2.614060093228282, + "grad_norm": 9.724801063537598, + "learning_rate": 6.432331779528636e-06, + "loss": 2.5534, + "step": 8409000 + }, + { + "epoch": 2.6142155255087687, + "grad_norm": 28.06261444091797, + "learning_rate": 6.429741241520522e-06, + "loss": 2.6076, + "step": 8409500 + }, + { + "epoch": 2.6143709577892555, + "grad_norm": 11.377358436584473, + "learning_rate": 6.427150703512406e-06, + "loss": 2.5977, + "step": 8410000 + }, + { + "epoch": 2.6145263900697424, + "grad_norm": 8.833795547485352, + "learning_rate": 6.424560165504293e-06, + "loss": 2.5488, + "step": 8410500 + }, + { + "epoch": 2.6146818223502293, + "grad_norm": 8.138504028320312, + "learning_rate": 6.421969627496179e-06, + "loss": 2.5769, + "step": 8411000 + }, + { + "epoch": 2.614837254630716, + "grad_norm": 10.410497665405273, + "learning_rate": 6.4193790894880635e-06, + "loss": 2.5972, + "step": 8411500 + }, + { + "epoch": 2.614992686911203, + "grad_norm": 10.03703498840332, + "learning_rate": 6.416788551479949e-06, + "loss": 2.602, + "step": 8412000 + }, + { + "epoch": 2.61514811919169, + "grad_norm": 10.373275756835938, + "learning_rate": 6.414198013471834e-06, + "loss": 2.5961, + "step": 8412500 + }, + { + "epoch": 2.6153035514721767, + "grad_norm": 13.754950523376465, + "learning_rate": 6.41160747546372e-06, + "loss": 2.6075, + "step": 8413000 + }, + { + "epoch": 2.6154589837526636, + "grad_norm": 14.303474426269531, + "learning_rate": 6.4090169374556045e-06, + "loss": 2.5877, + "step": 8413500 + }, + { + "epoch": 2.6156144160331505, + "grad_norm": 7.811459541320801, + "learning_rate": 6.406426399447491e-06, + "loss": 2.625, + "step": 8414000 + }, + { + "epoch": 2.6157698483136373, + "grad_norm": 9.810944557189941, + "learning_rate": 6.4038358614393754e-06, + "loss": 2.6289, + "step": 8414500 + }, + { + "epoch": 2.615925280594124, + "grad_norm": 11.184921264648438, + "learning_rate": 6.401245323431261e-06, + "loss": 2.6064, + "step": 8415000 + }, + { + "epoch": 2.616080712874611, + "grad_norm": 10.074687957763672, + "learning_rate": 6.398654785423147e-06, + "loss": 2.5731, + "step": 8415500 + }, + { + "epoch": 2.616236145155098, + "grad_norm": 23.662378311157227, + "learning_rate": 6.396064247415032e-06, + "loss": 2.5949, + "step": 8416000 + }, + { + "epoch": 2.616391577435585, + "grad_norm": 10.395750045776367, + "learning_rate": 6.393473709406918e-06, + "loss": 2.6098, + "step": 8416500 + }, + { + "epoch": 2.616547009716072, + "grad_norm": 10.950582504272461, + "learning_rate": 6.390883171398803e-06, + "loss": 2.6076, + "step": 8417000 + }, + { + "epoch": 2.6167024419965585, + "grad_norm": 11.061418533325195, + "learning_rate": 6.388292633390688e-06, + "loss": 2.5991, + "step": 8417500 + }, + { + "epoch": 2.616857874277046, + "grad_norm": 9.991767883300781, + "learning_rate": 6.385702095382573e-06, + "loss": 2.566, + "step": 8418000 + }, + { + "epoch": 2.6170133065575323, + "grad_norm": 9.489648818969727, + "learning_rate": 6.383111557374459e-06, + "loss": 2.5751, + "step": 8418500 + }, + { + "epoch": 2.6171687388380196, + "grad_norm": 10.866886138916016, + "learning_rate": 6.380521019366344e-06, + "loss": 2.6209, + "step": 8419000 + }, + { + "epoch": 2.617324171118506, + "grad_norm": 9.404526710510254, + "learning_rate": 6.37793048135823e-06, + "loss": 2.6539, + "step": 8419500 + }, + { + "epoch": 2.6174796033989933, + "grad_norm": 17.9898624420166, + "learning_rate": 6.3753399433501155e-06, + "loss": 2.592, + "step": 8420000 + }, + { + "epoch": 2.6176350356794797, + "grad_norm": 8.687386512756348, + "learning_rate": 6.372749405342e-06, + "loss": 2.5693, + "step": 8420500 + }, + { + "epoch": 2.617790467959967, + "grad_norm": 9.813857078552246, + "learning_rate": 6.370158867333886e-06, + "loss": 2.6167, + "step": 8421000 + }, + { + "epoch": 2.6179459002404535, + "grad_norm": 8.758238792419434, + "learning_rate": 6.367568329325771e-06, + "loss": 2.547, + "step": 8421500 + }, + { + "epoch": 2.618101332520941, + "grad_norm": 8.670063018798828, + "learning_rate": 6.364977791317657e-06, + "loss": 2.6211, + "step": 8422000 + }, + { + "epoch": 2.618256764801427, + "grad_norm": 12.009956359863281, + "learning_rate": 6.362387253309542e-06, + "loss": 2.5864, + "step": 8422500 + }, + { + "epoch": 2.6184121970819145, + "grad_norm": 11.948580741882324, + "learning_rate": 6.359796715301427e-06, + "loss": 2.5845, + "step": 8423000 + }, + { + "epoch": 2.6185676293624014, + "grad_norm": 12.444252014160156, + "learning_rate": 6.357206177293312e-06, + "loss": 2.6342, + "step": 8423500 + }, + { + "epoch": 2.6187230616428883, + "grad_norm": 12.318422317504883, + "learning_rate": 6.354615639285198e-06, + "loss": 2.6423, + "step": 8424000 + }, + { + "epoch": 2.618878493923375, + "grad_norm": 9.142181396484375, + "learning_rate": 6.352025101277085e-06, + "loss": 2.5814, + "step": 8424500 + }, + { + "epoch": 2.619033926203862, + "grad_norm": 10.146476745605469, + "learning_rate": 6.349434563268969e-06, + "loss": 2.6124, + "step": 8425000 + }, + { + "epoch": 2.619189358484349, + "grad_norm": 10.64388370513916, + "learning_rate": 6.346844025260855e-06, + "loss": 2.5482, + "step": 8425500 + }, + { + "epoch": 2.6193447907648357, + "grad_norm": 12.226089477539062, + "learning_rate": 6.344253487252739e-06, + "loss": 2.5691, + "step": 8426000 + }, + { + "epoch": 2.6195002230453226, + "grad_norm": 9.482626914978027, + "learning_rate": 6.341662949244626e-06, + "loss": 2.576, + "step": 8426500 + }, + { + "epoch": 2.6196556553258095, + "grad_norm": 34.006195068359375, + "learning_rate": 6.33907241123651e-06, + "loss": 2.5674, + "step": 8427000 + }, + { + "epoch": 2.6198110876062963, + "grad_norm": 9.668498992919922, + "learning_rate": 6.3364818732283965e-06, + "loss": 2.57, + "step": 8427500 + }, + { + "epoch": 2.619966519886783, + "grad_norm": 10.905255317687988, + "learning_rate": 6.333891335220281e-06, + "loss": 2.5604, + "step": 8428000 + }, + { + "epoch": 2.62012195216727, + "grad_norm": 11.406230926513672, + "learning_rate": 6.331300797212167e-06, + "loss": 2.5791, + "step": 8428500 + }, + { + "epoch": 2.620277384447757, + "grad_norm": 9.11614990234375, + "learning_rate": 6.328710259204053e-06, + "loss": 2.5602, + "step": 8429000 + }, + { + "epoch": 2.620432816728244, + "grad_norm": 10.216358184814453, + "learning_rate": 6.3261197211959375e-06, + "loss": 2.5717, + "step": 8429500 + }, + { + "epoch": 2.6205882490087307, + "grad_norm": 8.862043380737305, + "learning_rate": 6.323529183187824e-06, + "loss": 2.596, + "step": 8430000 + }, + { + "epoch": 2.6207436812892175, + "grad_norm": 15.50439739227295, + "learning_rate": 6.3209386451797084e-06, + "loss": 2.5631, + "step": 8430500 + }, + { + "epoch": 2.6208991135697044, + "grad_norm": 11.683679580688477, + "learning_rate": 6.318348107171594e-06, + "loss": 2.6271, + "step": 8431000 + }, + { + "epoch": 2.6210545458501913, + "grad_norm": 15.505237579345703, + "learning_rate": 6.3157575691634785e-06, + "loss": 2.5763, + "step": 8431500 + }, + { + "epoch": 2.621209978130678, + "grad_norm": 10.834951400756836, + "learning_rate": 6.313167031155365e-06, + "loss": 2.587, + "step": 8432000 + }, + { + "epoch": 2.621365410411165, + "grad_norm": 20.808073043823242, + "learning_rate": 6.310576493147251e-06, + "loss": 2.6019, + "step": 8432500 + }, + { + "epoch": 2.621520842691652, + "grad_norm": 12.801848411560059, + "learning_rate": 6.307985955139136e-06, + "loss": 2.5874, + "step": 8433000 + }, + { + "epoch": 2.6216762749721387, + "grad_norm": 8.18498706817627, + "learning_rate": 6.305395417131021e-06, + "loss": 2.6005, + "step": 8433500 + }, + { + "epoch": 2.6218317072526256, + "grad_norm": 44.80241394042969, + "learning_rate": 6.302804879122906e-06, + "loss": 2.5996, + "step": 8434000 + }, + { + "epoch": 2.6219871395331125, + "grad_norm": 7.874608993530273, + "learning_rate": 6.300214341114792e-06, + "loss": 2.5878, + "step": 8434500 + }, + { + "epoch": 2.6221425718135993, + "grad_norm": 11.266244888305664, + "learning_rate": 6.297623803106677e-06, + "loss": 2.631, + "step": 8435000 + }, + { + "epoch": 2.622298004094086, + "grad_norm": 10.865387916564941, + "learning_rate": 6.295033265098563e-06, + "loss": 2.5731, + "step": 8435500 + }, + { + "epoch": 2.622453436374573, + "grad_norm": 9.454704284667969, + "learning_rate": 6.292442727090448e-06, + "loss": 2.6261, + "step": 8436000 + }, + { + "epoch": 2.62260886865506, + "grad_norm": 10.517948150634766, + "learning_rate": 6.289852189082333e-06, + "loss": 2.5949, + "step": 8436500 + }, + { + "epoch": 2.622764300935547, + "grad_norm": 12.372233390808105, + "learning_rate": 6.287261651074219e-06, + "loss": 2.5854, + "step": 8437000 + }, + { + "epoch": 2.6229197332160337, + "grad_norm": 9.184768676757812, + "learning_rate": 6.284671113066104e-06, + "loss": 2.5999, + "step": 8437500 + }, + { + "epoch": 2.6230751654965205, + "grad_norm": 13.043652534484863, + "learning_rate": 6.28208057505799e-06, + "loss": 2.6065, + "step": 8438000 + }, + { + "epoch": 2.6232305977770074, + "grad_norm": 9.583988189697266, + "learning_rate": 6.279490037049875e-06, + "loss": 2.6185, + "step": 8438500 + }, + { + "epoch": 2.6233860300574943, + "grad_norm": 11.953428268432617, + "learning_rate": 6.27689949904176e-06, + "loss": 2.5912, + "step": 8439000 + }, + { + "epoch": 2.623541462337981, + "grad_norm": 10.229671478271484, + "learning_rate": 6.274308961033645e-06, + "loss": 2.6139, + "step": 8439500 + }, + { + "epoch": 2.623696894618468, + "grad_norm": 9.618630409240723, + "learning_rate": 6.271718423025531e-06, + "loss": 2.5526, + "step": 8440000 + }, + { + "epoch": 2.623852326898955, + "grad_norm": 9.46522045135498, + "learning_rate": 6.269127885017416e-06, + "loss": 2.604, + "step": 8440500 + }, + { + "epoch": 2.6240077591794417, + "grad_norm": 9.787168502807617, + "learning_rate": 6.266537347009302e-06, + "loss": 2.5915, + "step": 8441000 + }, + { + "epoch": 2.624163191459929, + "grad_norm": 10.14344310760498, + "learning_rate": 6.263946809001188e-06, + "loss": 2.6428, + "step": 8441500 + }, + { + "epoch": 2.6243186237404155, + "grad_norm": 7.569236755371094, + "learning_rate": 6.261356270993072e-06, + "loss": 2.6414, + "step": 8442000 + }, + { + "epoch": 2.624474056020903, + "grad_norm": 13.479161262512207, + "learning_rate": 6.258765732984959e-06, + "loss": 2.6116, + "step": 8442500 + }, + { + "epoch": 2.624629488301389, + "grad_norm": 8.75983715057373, + "learning_rate": 6.256175194976843e-06, + "loss": 2.5708, + "step": 8443000 + }, + { + "epoch": 2.6247849205818765, + "grad_norm": 9.158310890197754, + "learning_rate": 6.2535846569687295e-06, + "loss": 2.5832, + "step": 8443500 + }, + { + "epoch": 2.624940352862363, + "grad_norm": 43.542259216308594, + "learning_rate": 6.250994118960614e-06, + "loss": 2.5919, + "step": 8444000 + }, + { + "epoch": 2.6250957851428502, + "grad_norm": 10.023472785949707, + "learning_rate": 6.2484035809525e-06, + "loss": 2.5708, + "step": 8444500 + }, + { + "epoch": 2.6252512174233367, + "grad_norm": 16.125520706176758, + "learning_rate": 6.245813042944385e-06, + "loss": 2.6019, + "step": 8445000 + }, + { + "epoch": 2.625406649703824, + "grad_norm": 10.00516128540039, + "learning_rate": 6.2432225049362705e-06, + "loss": 2.62, + "step": 8445500 + }, + { + "epoch": 2.6255620819843104, + "grad_norm": 11.446626663208008, + "learning_rate": 6.240631966928156e-06, + "loss": 2.5846, + "step": 8446000 + }, + { + "epoch": 2.6257175142647977, + "grad_norm": 12.93912410736084, + "learning_rate": 6.2380414289200414e-06, + "loss": 2.5585, + "step": 8446500 + }, + { + "epoch": 2.6258729465452846, + "grad_norm": 10.176100730895996, + "learning_rate": 6.235450890911927e-06, + "loss": 2.6385, + "step": 8447000 + }, + { + "epoch": 2.6260283788257714, + "grad_norm": 16.172832489013672, + "learning_rate": 6.2328603529038115e-06, + "loss": 2.5963, + "step": 8447500 + }, + { + "epoch": 2.6261838111062583, + "grad_norm": 7.962249279022217, + "learning_rate": 6.230269814895697e-06, + "loss": 2.6, + "step": 8448000 + }, + { + "epoch": 2.626339243386745, + "grad_norm": 8.896327018737793, + "learning_rate": 6.227679276887583e-06, + "loss": 2.5927, + "step": 8448500 + }, + { + "epoch": 2.626494675667232, + "grad_norm": 14.029471397399902, + "learning_rate": 6.225088738879469e-06, + "loss": 2.6138, + "step": 8449000 + }, + { + "epoch": 2.626650107947719, + "grad_norm": 11.213020324707031, + "learning_rate": 6.222498200871354e-06, + "loss": 2.6515, + "step": 8449500 + }, + { + "epoch": 2.626805540228206, + "grad_norm": 10.52305793762207, + "learning_rate": 6.219907662863239e-06, + "loss": 2.5711, + "step": 8450000 + }, + { + "epoch": 2.6269609725086926, + "grad_norm": 10.130638122558594, + "learning_rate": 6.217317124855124e-06, + "loss": 2.607, + "step": 8450500 + }, + { + "epoch": 2.6271164047891795, + "grad_norm": 8.852980613708496, + "learning_rate": 6.21472658684701e-06, + "loss": 2.6009, + "step": 8451000 + }, + { + "epoch": 2.6272718370696664, + "grad_norm": 10.323249816894531, + "learning_rate": 6.212136048838895e-06, + "loss": 2.6732, + "step": 8451500 + }, + { + "epoch": 2.6274272693501532, + "grad_norm": 11.483500480651855, + "learning_rate": 6.209545510830781e-06, + "loss": 2.5728, + "step": 8452000 + }, + { + "epoch": 2.62758270163064, + "grad_norm": 12.172883987426758, + "learning_rate": 6.206954972822666e-06, + "loss": 2.5785, + "step": 8452500 + }, + { + "epoch": 2.627738133911127, + "grad_norm": 12.419195175170898, + "learning_rate": 6.2043644348145516e-06, + "loss": 2.5726, + "step": 8453000 + }, + { + "epoch": 2.627893566191614, + "grad_norm": 9.896686553955078, + "learning_rate": 6.201773896806437e-06, + "loss": 2.5807, + "step": 8453500 + }, + { + "epoch": 2.6280489984721007, + "grad_norm": 9.96854305267334, + "learning_rate": 6.1991833587983225e-06, + "loss": 2.6228, + "step": 8454000 + }, + { + "epoch": 2.6282044307525876, + "grad_norm": 9.898765563964844, + "learning_rate": 6.196592820790208e-06, + "loss": 2.5745, + "step": 8454500 + }, + { + "epoch": 2.6283598630330745, + "grad_norm": 9.788101196289062, + "learning_rate": 6.194002282782093e-06, + "loss": 2.6244, + "step": 8455000 + }, + { + "epoch": 2.6285152953135613, + "grad_norm": 19.011140823364258, + "learning_rate": 6.191411744773978e-06, + "loss": 2.6148, + "step": 8455500 + }, + { + "epoch": 2.628670727594048, + "grad_norm": 9.886441230773926, + "learning_rate": 6.1888212067658635e-06, + "loss": 2.5742, + "step": 8456000 + }, + { + "epoch": 2.628826159874535, + "grad_norm": 9.638155937194824, + "learning_rate": 6.186230668757749e-06, + "loss": 2.6563, + "step": 8456500 + }, + { + "epoch": 2.628981592155022, + "grad_norm": 9.004475593566895, + "learning_rate": 6.183640130749634e-06, + "loss": 2.5918, + "step": 8457000 + }, + { + "epoch": 2.629137024435509, + "grad_norm": 13.675167083740234, + "learning_rate": 6.181049592741521e-06, + "loss": 2.5856, + "step": 8457500 + }, + { + "epoch": 2.6292924567159957, + "grad_norm": 9.335500717163086, + "learning_rate": 6.178459054733405e-06, + "loss": 2.5622, + "step": 8458000 + }, + { + "epoch": 2.6294478889964825, + "grad_norm": 10.343338966369629, + "learning_rate": 6.175868516725291e-06, + "loss": 2.5891, + "step": 8458500 + }, + { + "epoch": 2.6296033212769694, + "grad_norm": 6.623865127563477, + "learning_rate": 6.173277978717176e-06, + "loss": 2.5889, + "step": 8459000 + }, + { + "epoch": 2.6297587535574563, + "grad_norm": 16.268991470336914, + "learning_rate": 6.170687440709062e-06, + "loss": 2.6144, + "step": 8459500 + }, + { + "epoch": 2.629914185837943, + "grad_norm": 11.209172248840332, + "learning_rate": 6.168096902700947e-06, + "loss": 2.5763, + "step": 8460000 + }, + { + "epoch": 2.63006961811843, + "grad_norm": 12.235115051269531, + "learning_rate": 6.165506364692833e-06, + "loss": 2.5876, + "step": 8460500 + }, + { + "epoch": 2.630225050398917, + "grad_norm": 11.828835487365723, + "learning_rate": 6.162915826684717e-06, + "loss": 2.5367, + "step": 8461000 + }, + { + "epoch": 2.6303804826794037, + "grad_norm": 9.929150581359863, + "learning_rate": 6.160325288676603e-06, + "loss": 2.5952, + "step": 8461500 + }, + { + "epoch": 2.6305359149598906, + "grad_norm": 10.883347511291504, + "learning_rate": 6.157734750668489e-06, + "loss": 2.5932, + "step": 8462000 + }, + { + "epoch": 2.6306913472403775, + "grad_norm": 8.752660751342773, + "learning_rate": 6.1551442126603744e-06, + "loss": 2.5904, + "step": 8462500 + }, + { + "epoch": 2.6308467795208643, + "grad_norm": 9.52159309387207, + "learning_rate": 6.15255367465226e-06, + "loss": 2.562, + "step": 8463000 + }, + { + "epoch": 2.631002211801351, + "grad_norm": 10.018804550170898, + "learning_rate": 6.1499631366441445e-06, + "loss": 2.6365, + "step": 8463500 + }, + { + "epoch": 2.631157644081838, + "grad_norm": 9.796208381652832, + "learning_rate": 6.14737259863603e-06, + "loss": 2.6111, + "step": 8464000 + }, + { + "epoch": 2.631313076362325, + "grad_norm": 10.33725357055664, + "learning_rate": 6.1447820606279154e-06, + "loss": 2.6117, + "step": 8464500 + }, + { + "epoch": 2.6314685086428122, + "grad_norm": 12.399251937866211, + "learning_rate": 6.142191522619801e-06, + "loss": 2.5724, + "step": 8465000 + }, + { + "epoch": 2.6316239409232987, + "grad_norm": 13.035831451416016, + "learning_rate": 6.139600984611686e-06, + "loss": 2.5969, + "step": 8465500 + }, + { + "epoch": 2.631779373203786, + "grad_norm": 8.516242980957031, + "learning_rate": 6.137010446603572e-06, + "loss": 2.5729, + "step": 8466000 + }, + { + "epoch": 2.6319348054842724, + "grad_norm": 11.215677261352539, + "learning_rate": 6.134419908595457e-06, + "loss": 2.5523, + "step": 8466500 + }, + { + "epoch": 2.6320902377647597, + "grad_norm": 10.208416938781738, + "learning_rate": 6.131829370587343e-06, + "loss": 2.5784, + "step": 8467000 + }, + { + "epoch": 2.632245670045246, + "grad_norm": 10.050766944885254, + "learning_rate": 6.129238832579228e-06, + "loss": 2.5883, + "step": 8467500 + }, + { + "epoch": 2.6324011023257334, + "grad_norm": 7.957560062408447, + "learning_rate": 6.126648294571114e-06, + "loss": 2.5493, + "step": 8468000 + }, + { + "epoch": 2.63255653460622, + "grad_norm": 9.531746864318848, + "learning_rate": 6.124057756562999e-06, + "loss": 2.616, + "step": 8468500 + }, + { + "epoch": 2.632711966886707, + "grad_norm": 11.264124870300293, + "learning_rate": 6.121467218554884e-06, + "loss": 2.5797, + "step": 8469000 + }, + { + "epoch": 2.6328673991671936, + "grad_norm": 14.573711395263672, + "learning_rate": 6.118876680546769e-06, + "loss": 2.5944, + "step": 8469500 + }, + { + "epoch": 2.633022831447681, + "grad_norm": 9.074980735778809, + "learning_rate": 6.116286142538655e-06, + "loss": 2.5897, + "step": 8470000 + }, + { + "epoch": 2.6331782637281673, + "grad_norm": 11.013484954833984, + "learning_rate": 6.113695604530541e-06, + "loss": 2.562, + "step": 8470500 + }, + { + "epoch": 2.6333336960086546, + "grad_norm": 8.70943832397461, + "learning_rate": 6.111105066522426e-06, + "loss": 2.5824, + "step": 8471000 + }, + { + "epoch": 2.6334891282891415, + "grad_norm": 14.222054481506348, + "learning_rate": 6.108514528514311e-06, + "loss": 2.6108, + "step": 8471500 + }, + { + "epoch": 2.6336445605696284, + "grad_norm": 9.729911804199219, + "learning_rate": 6.1059239905061965e-06, + "loss": 2.6111, + "step": 8472000 + }, + { + "epoch": 2.6337999928501152, + "grad_norm": 9.416346549987793, + "learning_rate": 6.103333452498082e-06, + "loss": 2.5957, + "step": 8472500 + }, + { + "epoch": 2.633955425130602, + "grad_norm": 12.407397270202637, + "learning_rate": 6.100742914489967e-06, + "loss": 2.6015, + "step": 8473000 + }, + { + "epoch": 2.634110857411089, + "grad_norm": 8.981066703796387, + "learning_rate": 6.098152376481853e-06, + "loss": 2.6017, + "step": 8473500 + }, + { + "epoch": 2.634266289691576, + "grad_norm": 10.589216232299805, + "learning_rate": 6.095561838473738e-06, + "loss": 2.5991, + "step": 8474000 + }, + { + "epoch": 2.6344217219720627, + "grad_norm": 7.629467487335205, + "learning_rate": 6.092971300465623e-06, + "loss": 2.636, + "step": 8474500 + }, + { + "epoch": 2.6345771542525496, + "grad_norm": 44.37935256958008, + "learning_rate": 6.090380762457509e-06, + "loss": 2.6294, + "step": 8475000 + }, + { + "epoch": 2.6347325865330364, + "grad_norm": 10.719644546508789, + "learning_rate": 6.087790224449395e-06, + "loss": 2.5681, + "step": 8475500 + }, + { + "epoch": 2.6348880188135233, + "grad_norm": 8.966079711914062, + "learning_rate": 6.08519968644128e-06, + "loss": 2.5742, + "step": 8476000 + }, + { + "epoch": 2.63504345109401, + "grad_norm": 14.379152297973633, + "learning_rate": 6.082609148433166e-06, + "loss": 2.5733, + "step": 8476500 + }, + { + "epoch": 2.635198883374497, + "grad_norm": 12.885276794433594, + "learning_rate": 6.08001861042505e-06, + "loss": 2.6034, + "step": 8477000 + }, + { + "epoch": 2.635354315654984, + "grad_norm": 9.446037292480469, + "learning_rate": 6.077428072416936e-06, + "loss": 2.5672, + "step": 8477500 + }, + { + "epoch": 2.6355097479354708, + "grad_norm": 9.91387939453125, + "learning_rate": 6.074837534408821e-06, + "loss": 2.5632, + "step": 8478000 + }, + { + "epoch": 2.6356651802159576, + "grad_norm": 9.20536994934082, + "learning_rate": 6.072246996400707e-06, + "loss": 2.5432, + "step": 8478500 + }, + { + "epoch": 2.6358206124964445, + "grad_norm": 8.406197547912598, + "learning_rate": 6.069656458392592e-06, + "loss": 2.5814, + "step": 8479000 + }, + { + "epoch": 2.6359760447769314, + "grad_norm": 10.512687683105469, + "learning_rate": 6.0670659203844775e-06, + "loss": 2.6075, + "step": 8479500 + }, + { + "epoch": 2.6361314770574182, + "grad_norm": 34.18730545043945, + "learning_rate": 6.064475382376363e-06, + "loss": 2.567, + "step": 8480000 + }, + { + "epoch": 2.636286909337905, + "grad_norm": 9.898353576660156, + "learning_rate": 6.0618848443682484e-06, + "loss": 2.6428, + "step": 8480500 + }, + { + "epoch": 2.636442341618392, + "grad_norm": 8.682573318481445, + "learning_rate": 6.059294306360134e-06, + "loss": 2.6071, + "step": 8481000 + }, + { + "epoch": 2.636597773898879, + "grad_norm": 8.062294960021973, + "learning_rate": 6.056703768352019e-06, + "loss": 2.63, + "step": 8481500 + }, + { + "epoch": 2.6367532061793657, + "grad_norm": 11.112706184387207, + "learning_rate": 6.054113230343905e-06, + "loss": 2.5788, + "step": 8482000 + }, + { + "epoch": 2.6369086384598526, + "grad_norm": 8.85234260559082, + "learning_rate": 6.051522692335789e-06, + "loss": 2.5771, + "step": 8482500 + }, + { + "epoch": 2.6370640707403394, + "grad_norm": 9.189953804016113, + "learning_rate": 6.048932154327675e-06, + "loss": 2.6042, + "step": 8483000 + }, + { + "epoch": 2.6372195030208263, + "grad_norm": 10.890737533569336, + "learning_rate": 6.04634161631956e-06, + "loss": 2.5884, + "step": 8483500 + }, + { + "epoch": 2.637374935301313, + "grad_norm": 12.570712089538574, + "learning_rate": 6.043751078311447e-06, + "loss": 2.6022, + "step": 8484000 + }, + { + "epoch": 2.6375303675818, + "grad_norm": 10.175236701965332, + "learning_rate": 6.041160540303332e-06, + "loss": 2.6213, + "step": 8484500 + }, + { + "epoch": 2.637685799862287, + "grad_norm": 9.731078147888184, + "learning_rate": 6.038570002295217e-06, + "loss": 2.6005, + "step": 8485000 + }, + { + "epoch": 2.6378412321427738, + "grad_norm": 9.915349960327148, + "learning_rate": 6.035979464287102e-06, + "loss": 2.6694, + "step": 8485500 + }, + { + "epoch": 2.6379966644232606, + "grad_norm": 51.02454376220703, + "learning_rate": 6.033388926278988e-06, + "loss": 2.6241, + "step": 8486000 + }, + { + "epoch": 2.6381520967037475, + "grad_norm": 10.29944896697998, + "learning_rate": 6.030798388270873e-06, + "loss": 2.5796, + "step": 8486500 + }, + { + "epoch": 2.6383075289842344, + "grad_norm": 10.786234855651855, + "learning_rate": 6.0282078502627586e-06, + "loss": 2.5828, + "step": 8487000 + }, + { + "epoch": 2.6384629612647212, + "grad_norm": 11.526188850402832, + "learning_rate": 6.025617312254644e-06, + "loss": 2.622, + "step": 8487500 + }, + { + "epoch": 2.638618393545208, + "grad_norm": 13.037201881408691, + "learning_rate": 6.0230267742465295e-06, + "loss": 2.6051, + "step": 8488000 + }, + { + "epoch": 2.638773825825695, + "grad_norm": 9.41513729095459, + "learning_rate": 6.020436236238415e-06, + "loss": 2.5538, + "step": 8488500 + }, + { + "epoch": 2.638929258106182, + "grad_norm": 7.733190059661865, + "learning_rate": 6.0178456982303e-06, + "loss": 2.603, + "step": 8489000 + }, + { + "epoch": 2.639084690386669, + "grad_norm": 9.623026847839355, + "learning_rate": 6.015255160222186e-06, + "loss": 2.5723, + "step": 8489500 + }, + { + "epoch": 2.6392401226671556, + "grad_norm": 9.439891815185547, + "learning_rate": 6.012664622214071e-06, + "loss": 2.5357, + "step": 8490000 + }, + { + "epoch": 2.639395554947643, + "grad_norm": 9.703368186950684, + "learning_rate": 6.010074084205957e-06, + "loss": 2.5571, + "step": 8490500 + }, + { + "epoch": 2.6395509872281293, + "grad_norm": 11.247907638549805, + "learning_rate": 6.007483546197841e-06, + "loss": 2.6165, + "step": 8491000 + }, + { + "epoch": 2.6397064195086166, + "grad_norm": 19.2455997467041, + "learning_rate": 6.004893008189727e-06, + "loss": 2.6132, + "step": 8491500 + }, + { + "epoch": 2.639861851789103, + "grad_norm": 8.582197189331055, + "learning_rate": 6.002302470181612e-06, + "loss": 2.5826, + "step": 8492000 + }, + { + "epoch": 2.6400172840695904, + "grad_norm": 9.435590744018555, + "learning_rate": 5.999711932173498e-06, + "loss": 2.5849, + "step": 8492500 + }, + { + "epoch": 2.6401727163500768, + "grad_norm": 13.047743797302246, + "learning_rate": 5.997121394165384e-06, + "loss": 2.614, + "step": 8493000 + }, + { + "epoch": 2.640328148630564, + "grad_norm": 8.75361156463623, + "learning_rate": 5.994530856157269e-06, + "loss": 2.5749, + "step": 8493500 + }, + { + "epoch": 2.6404835809110505, + "grad_norm": 16.05527687072754, + "learning_rate": 5.991940318149154e-06, + "loss": 2.5562, + "step": 8494000 + }, + { + "epoch": 2.640639013191538, + "grad_norm": 9.109619140625, + "learning_rate": 5.98934978014104e-06, + "loss": 2.611, + "step": 8494500 + }, + { + "epoch": 2.6407944454720247, + "grad_norm": 10.865398406982422, + "learning_rate": 5.986759242132925e-06, + "loss": 2.583, + "step": 8495000 + }, + { + "epoch": 2.6409498777525116, + "grad_norm": 13.3833589553833, + "learning_rate": 5.9841687041248105e-06, + "loss": 2.6065, + "step": 8495500 + }, + { + "epoch": 2.6411053100329984, + "grad_norm": 12.507015228271484, + "learning_rate": 5.981578166116696e-06, + "loss": 2.5493, + "step": 8496000 + }, + { + "epoch": 2.6412607423134853, + "grad_norm": 8.691424369812012, + "learning_rate": 5.978987628108581e-06, + "loss": 2.5624, + "step": 8496500 + }, + { + "epoch": 2.641416174593972, + "grad_norm": 1773.4337158203125, + "learning_rate": 5.976397090100466e-06, + "loss": 2.6154, + "step": 8497000 + }, + { + "epoch": 2.641571606874459, + "grad_norm": 10.062753677368164, + "learning_rate": 5.973806552092352e-06, + "loss": 2.5751, + "step": 8497500 + }, + { + "epoch": 2.641727039154946, + "grad_norm": 13.627091407775879, + "learning_rate": 5.971216014084238e-06, + "loss": 2.5775, + "step": 8498000 + }, + { + "epoch": 2.6418824714354328, + "grad_norm": 10.415108680725098, + "learning_rate": 5.968625476076123e-06, + "loss": 2.5847, + "step": 8498500 + }, + { + "epoch": 2.6420379037159196, + "grad_norm": 12.580066680908203, + "learning_rate": 5.966034938068008e-06, + "loss": 2.6051, + "step": 8499000 + }, + { + "epoch": 2.6421933359964065, + "grad_norm": 12.882216453552246, + "learning_rate": 5.963444400059893e-06, + "loss": 2.6201, + "step": 8499500 + }, + { + "epoch": 2.6423487682768934, + "grad_norm": 9.410850524902344, + "learning_rate": 5.960853862051779e-06, + "loss": 2.5927, + "step": 8500000 + }, + { + "epoch": 2.6425042005573802, + "grad_norm": 17.61934471130371, + "learning_rate": 5.958263324043664e-06, + "loss": 2.5739, + "step": 8500500 + }, + { + "epoch": 2.642659632837867, + "grad_norm": 9.416830062866211, + "learning_rate": 5.95567278603555e-06, + "loss": 2.5707, + "step": 8501000 + }, + { + "epoch": 2.642815065118354, + "grad_norm": 9.922601699829102, + "learning_rate": 5.953082248027435e-06, + "loss": 2.6127, + "step": 8501500 + }, + { + "epoch": 2.642970497398841, + "grad_norm": 9.579622268676758, + "learning_rate": 5.950491710019321e-06, + "loss": 2.5775, + "step": 8502000 + }, + { + "epoch": 2.6431259296793277, + "grad_norm": 8.247016906738281, + "learning_rate": 5.947901172011206e-06, + "loss": 2.5939, + "step": 8502500 + }, + { + "epoch": 2.6432813619598146, + "grad_norm": 9.411947250366211, + "learning_rate": 5.9453106340030916e-06, + "loss": 2.5961, + "step": 8503000 + }, + { + "epoch": 2.6434367942403014, + "grad_norm": 15.3934907913208, + "learning_rate": 5.942720095994977e-06, + "loss": 2.5712, + "step": 8503500 + }, + { + "epoch": 2.6435922265207883, + "grad_norm": 11.941292762756348, + "learning_rate": 5.9401295579868625e-06, + "loss": 2.594, + "step": 8504000 + }, + { + "epoch": 2.643747658801275, + "grad_norm": 8.143333435058594, + "learning_rate": 5.937539019978747e-06, + "loss": 2.6135, + "step": 8504500 + }, + { + "epoch": 2.643903091081762, + "grad_norm": 8.008056640625, + "learning_rate": 5.9349484819706325e-06, + "loss": 2.5932, + "step": 8505000 + }, + { + "epoch": 2.644058523362249, + "grad_norm": 15.704654693603516, + "learning_rate": 5.932357943962518e-06, + "loss": 2.5967, + "step": 8505500 + }, + { + "epoch": 2.6442139556427358, + "grad_norm": 9.080280303955078, + "learning_rate": 5.9297674059544035e-06, + "loss": 2.5535, + "step": 8506000 + }, + { + "epoch": 2.6443693879232226, + "grad_norm": 14.377503395080566, + "learning_rate": 5.92717686794629e-06, + "loss": 2.5452, + "step": 8506500 + }, + { + "epoch": 2.6445248202037095, + "grad_norm": 13.637474060058594, + "learning_rate": 5.924586329938174e-06, + "loss": 2.571, + "step": 8507000 + }, + { + "epoch": 2.6446802524841964, + "grad_norm": 9.483092308044434, + "learning_rate": 5.92199579193006e-06, + "loss": 2.5876, + "step": 8507500 + }, + { + "epoch": 2.6448356847646832, + "grad_norm": 9.67770004272461, + "learning_rate": 5.919405253921945e-06, + "loss": 2.5682, + "step": 8508000 + }, + { + "epoch": 2.64499111704517, + "grad_norm": 7.698638439178467, + "learning_rate": 5.916814715913831e-06, + "loss": 2.6377, + "step": 8508500 + }, + { + "epoch": 2.645146549325657, + "grad_norm": 11.236478805541992, + "learning_rate": 5.914224177905716e-06, + "loss": 2.5907, + "step": 8509000 + }, + { + "epoch": 2.645301981606144, + "grad_norm": 13.39370059967041, + "learning_rate": 5.911633639897602e-06, + "loss": 2.5944, + "step": 8509500 + }, + { + "epoch": 2.6454574138866307, + "grad_norm": 11.227947235107422, + "learning_rate": 5.909043101889486e-06, + "loss": 2.6092, + "step": 8510000 + }, + { + "epoch": 2.6456128461671176, + "grad_norm": 11.217924118041992, + "learning_rate": 5.906452563881372e-06, + "loss": 2.636, + "step": 8510500 + }, + { + "epoch": 2.6457682784476044, + "grad_norm": 11.961922645568848, + "learning_rate": 5.903862025873258e-06, + "loss": 2.6515, + "step": 8511000 + }, + { + "epoch": 2.6459237107280913, + "grad_norm": 9.679652214050293, + "learning_rate": 5.9012714878651435e-06, + "loss": 2.5826, + "step": 8511500 + }, + { + "epoch": 2.646079143008578, + "grad_norm": 8.810840606689453, + "learning_rate": 5.898680949857029e-06, + "loss": 2.5589, + "step": 8512000 + }, + { + "epoch": 2.646234575289065, + "grad_norm": 11.473957061767578, + "learning_rate": 5.896090411848914e-06, + "loss": 2.6247, + "step": 8512500 + }, + { + "epoch": 2.646390007569552, + "grad_norm": 9.284614562988281, + "learning_rate": 5.893499873840799e-06, + "loss": 2.578, + "step": 8513000 + }, + { + "epoch": 2.6465454398500388, + "grad_norm": 10.433924674987793, + "learning_rate": 5.8909093358326845e-06, + "loss": 2.6126, + "step": 8513500 + }, + { + "epoch": 2.646700872130526, + "grad_norm": 10.861682891845703, + "learning_rate": 5.88831879782457e-06, + "loss": 2.5863, + "step": 8514000 + }, + { + "epoch": 2.6468563044110125, + "grad_norm": 11.566740036010742, + "learning_rate": 5.8857282598164554e-06, + "loss": 2.6007, + "step": 8514500 + }, + { + "epoch": 2.6470117366915, + "grad_norm": 12.513912200927734, + "learning_rate": 5.883137721808341e-06, + "loss": 2.6129, + "step": 8515000 + }, + { + "epoch": 2.6471671689719862, + "grad_norm": 10.548788070678711, + "learning_rate": 5.880547183800226e-06, + "loss": 2.597, + "step": 8515500 + }, + { + "epoch": 2.6473226012524735, + "grad_norm": 22.411590576171875, + "learning_rate": 5.877956645792112e-06, + "loss": 2.6281, + "step": 8516000 + }, + { + "epoch": 2.64747803353296, + "grad_norm": 12.317623138427734, + "learning_rate": 5.875366107783997e-06, + "loss": 2.6212, + "step": 8516500 + }, + { + "epoch": 2.6476334658134473, + "grad_norm": 11.626374244689941, + "learning_rate": 5.872775569775883e-06, + "loss": 2.5687, + "step": 8517000 + }, + { + "epoch": 2.6477888980939337, + "grad_norm": 10.69851303100586, + "learning_rate": 5.870185031767768e-06, + "loss": 2.6137, + "step": 8517500 + }, + { + "epoch": 2.647944330374421, + "grad_norm": 9.309576034545898, + "learning_rate": 5.867594493759653e-06, + "loss": 2.5802, + "step": 8518000 + }, + { + "epoch": 2.6480997626549074, + "grad_norm": 9.99787425994873, + "learning_rate": 5.865003955751538e-06, + "loss": 2.6021, + "step": 8518500 + }, + { + "epoch": 2.6482551949353947, + "grad_norm": 12.457841873168945, + "learning_rate": 5.862413417743424e-06, + "loss": 2.5452, + "step": 8519000 + }, + { + "epoch": 2.6484106272158816, + "grad_norm": 12.110578536987305, + "learning_rate": 5.859822879735309e-06, + "loss": 2.6109, + "step": 8519500 + }, + { + "epoch": 2.6485660594963685, + "grad_norm": 12.005568504333496, + "learning_rate": 5.8572323417271955e-06, + "loss": 2.5834, + "step": 8520000 + }, + { + "epoch": 2.6487214917768553, + "grad_norm": 10.831884384155273, + "learning_rate": 5.85464180371908e-06, + "loss": 2.5778, + "step": 8520500 + }, + { + "epoch": 2.648876924057342, + "grad_norm": 9.345959663391113, + "learning_rate": 5.8520512657109656e-06, + "loss": 2.5839, + "step": 8521000 + }, + { + "epoch": 2.649032356337829, + "grad_norm": 11.479436874389648, + "learning_rate": 5.849460727702851e-06, + "loss": 2.5853, + "step": 8521500 + }, + { + "epoch": 2.649187788618316, + "grad_norm": 14.292621612548828, + "learning_rate": 5.8468701896947365e-06, + "loss": 2.5717, + "step": 8522000 + }, + { + "epoch": 2.649343220898803, + "grad_norm": 12.22812271118164, + "learning_rate": 5.844279651686622e-06, + "loss": 2.5829, + "step": 8522500 + }, + { + "epoch": 2.6494986531792897, + "grad_norm": 14.018041610717773, + "learning_rate": 5.841689113678507e-06, + "loss": 2.6078, + "step": 8523000 + }, + { + "epoch": 2.6496540854597765, + "grad_norm": 14.722749710083008, + "learning_rate": 5.839098575670392e-06, + "loss": 2.6106, + "step": 8523500 + }, + { + "epoch": 2.6498095177402634, + "grad_norm": 9.107206344604492, + "learning_rate": 5.8365080376622775e-06, + "loss": 2.5671, + "step": 8524000 + }, + { + "epoch": 2.6499649500207503, + "grad_norm": 9.960153579711914, + "learning_rate": 5.833917499654164e-06, + "loss": 2.5917, + "step": 8524500 + }, + { + "epoch": 2.650120382301237, + "grad_norm": 10.78860092163086, + "learning_rate": 5.831326961646049e-06, + "loss": 2.5733, + "step": 8525000 + }, + { + "epoch": 2.650275814581724, + "grad_norm": 12.389512062072754, + "learning_rate": 5.828736423637935e-06, + "loss": 2.5662, + "step": 8525500 + }, + { + "epoch": 2.650431246862211, + "grad_norm": 8.755793571472168, + "learning_rate": 5.826145885629819e-06, + "loss": 2.5817, + "step": 8526000 + }, + { + "epoch": 2.6505866791426977, + "grad_norm": 9.195562362670898, + "learning_rate": 5.823555347621705e-06, + "loss": 2.6198, + "step": 8526500 + }, + { + "epoch": 2.6507421114231846, + "grad_norm": 9.613353729248047, + "learning_rate": 5.82096480961359e-06, + "loss": 2.5617, + "step": 8527000 + }, + { + "epoch": 2.6508975437036715, + "grad_norm": 10.875463485717773, + "learning_rate": 5.818374271605476e-06, + "loss": 2.5819, + "step": 8527500 + }, + { + "epoch": 2.6510529759841583, + "grad_norm": 12.212096214294434, + "learning_rate": 5.815783733597361e-06, + "loss": 2.6055, + "step": 8528000 + }, + { + "epoch": 2.651208408264645, + "grad_norm": 10.280614852905273, + "learning_rate": 5.813193195589247e-06, + "loss": 2.6011, + "step": 8528500 + }, + { + "epoch": 2.651363840545132, + "grad_norm": 11.43324089050293, + "learning_rate": 5.810602657581132e-06, + "loss": 2.6061, + "step": 8529000 + }, + { + "epoch": 2.651519272825619, + "grad_norm": 10.907038688659668, + "learning_rate": 5.8080121195730175e-06, + "loss": 2.6112, + "step": 8529500 + }, + { + "epoch": 2.651674705106106, + "grad_norm": 15.10352897644043, + "learning_rate": 5.805421581564903e-06, + "loss": 2.6161, + "step": 8530000 + }, + { + "epoch": 2.6518301373865927, + "grad_norm": 47.7629508972168, + "learning_rate": 5.8028310435567884e-06, + "loss": 2.6228, + "step": 8530500 + }, + { + "epoch": 2.6519855696670795, + "grad_norm": 8.490106582641602, + "learning_rate": 5.800240505548674e-06, + "loss": 2.561, + "step": 8531000 + }, + { + "epoch": 2.6521410019475664, + "grad_norm": 11.379474639892578, + "learning_rate": 5.7976499675405585e-06, + "loss": 2.6217, + "step": 8531500 + }, + { + "epoch": 2.6522964342280533, + "grad_norm": 12.606451988220215, + "learning_rate": 5.795059429532444e-06, + "loss": 2.5977, + "step": 8532000 + }, + { + "epoch": 2.65245186650854, + "grad_norm": 8.482938766479492, + "learning_rate": 5.792468891524329e-06, + "loss": 2.5697, + "step": 8532500 + }, + { + "epoch": 2.652607298789027, + "grad_norm": 17.63420867919922, + "learning_rate": 5.789878353516216e-06, + "loss": 2.5602, + "step": 8533000 + }, + { + "epoch": 2.652762731069514, + "grad_norm": 9.563201904296875, + "learning_rate": 5.787287815508101e-06, + "loss": 2.5411, + "step": 8533500 + }, + { + "epoch": 2.6529181633500007, + "grad_norm": 11.560141563415527, + "learning_rate": 5.784697277499986e-06, + "loss": 2.5918, + "step": 8534000 + }, + { + "epoch": 2.6530735956304876, + "grad_norm": 13.264717102050781, + "learning_rate": 5.782106739491871e-06, + "loss": 2.5599, + "step": 8534500 + }, + { + "epoch": 2.6532290279109745, + "grad_norm": 20.345867156982422, + "learning_rate": 5.779516201483757e-06, + "loss": 2.5799, + "step": 8535000 + }, + { + "epoch": 2.6533844601914613, + "grad_norm": 11.417458534240723, + "learning_rate": 5.776925663475642e-06, + "loss": 2.5946, + "step": 8535500 + }, + { + "epoch": 2.653539892471948, + "grad_norm": 9.159534454345703, + "learning_rate": 5.774335125467528e-06, + "loss": 2.5728, + "step": 8536000 + }, + { + "epoch": 2.653695324752435, + "grad_norm": 9.28123950958252, + "learning_rate": 5.771744587459413e-06, + "loss": 2.6015, + "step": 8536500 + }, + { + "epoch": 2.653850757032922, + "grad_norm": 9.245747566223145, + "learning_rate": 5.7691540494512986e-06, + "loss": 2.5991, + "step": 8537000 + }, + { + "epoch": 2.6540061893134093, + "grad_norm": 8.93964672088623, + "learning_rate": 5.766563511443184e-06, + "loss": 2.522, + "step": 8537500 + }, + { + "epoch": 2.6541616215938957, + "grad_norm": 10.295464515686035, + "learning_rate": 5.7639729734350695e-06, + "loss": 2.6135, + "step": 8538000 + }, + { + "epoch": 2.654317053874383, + "grad_norm": 14.444436073303223, + "learning_rate": 5.761382435426955e-06, + "loss": 2.579, + "step": 8538500 + }, + { + "epoch": 2.6544724861548694, + "grad_norm": 10.425363540649414, + "learning_rate": 5.75879189741884e-06, + "loss": 2.6108, + "step": 8539000 + }, + { + "epoch": 2.6546279184353567, + "grad_norm": 9.710770606994629, + "learning_rate": 5.756201359410726e-06, + "loss": 2.5963, + "step": 8539500 + }, + { + "epoch": 2.654783350715843, + "grad_norm": 13.070207595825195, + "learning_rate": 5.7536108214026105e-06, + "loss": 2.5838, + "step": 8540000 + }, + { + "epoch": 2.6549387829963305, + "grad_norm": 31.138628005981445, + "learning_rate": 5.751020283394496e-06, + "loss": 2.5994, + "step": 8540500 + }, + { + "epoch": 2.655094215276817, + "grad_norm": 8.158150672912598, + "learning_rate": 5.748429745386381e-06, + "loss": 2.5977, + "step": 8541000 + }, + { + "epoch": 2.655249647557304, + "grad_norm": 11.139362335205078, + "learning_rate": 5.745839207378267e-06, + "loss": 2.5872, + "step": 8541500 + }, + { + "epoch": 2.6554050798377906, + "grad_norm": 9.38013744354248, + "learning_rate": 5.743248669370153e-06, + "loss": 2.5499, + "step": 8542000 + }, + { + "epoch": 2.655560512118278, + "grad_norm": 9.057130813598633, + "learning_rate": 5.740658131362038e-06, + "loss": 2.572, + "step": 8542500 + }, + { + "epoch": 2.6557159443987643, + "grad_norm": 11.743596076965332, + "learning_rate": 5.738067593353923e-06, + "loss": 2.576, + "step": 8543000 + }, + { + "epoch": 2.6558713766792517, + "grad_norm": 39.1728630065918, + "learning_rate": 5.735477055345809e-06, + "loss": 2.5968, + "step": 8543500 + }, + { + "epoch": 2.6560268089597385, + "grad_norm": 10.964502334594727, + "learning_rate": 5.732886517337694e-06, + "loss": 2.5808, + "step": 8544000 + }, + { + "epoch": 2.6561822412402254, + "grad_norm": 12.306660652160645, + "learning_rate": 5.73029597932958e-06, + "loss": 2.6021, + "step": 8544500 + }, + { + "epoch": 2.6563376735207123, + "grad_norm": 9.665386199951172, + "learning_rate": 5.727705441321465e-06, + "loss": 2.6308, + "step": 8545000 + }, + { + "epoch": 2.656493105801199, + "grad_norm": 10.624883651733398, + "learning_rate": 5.72511490331335e-06, + "loss": 2.5992, + "step": 8545500 + }, + { + "epoch": 2.656648538081686, + "grad_norm": 10.044617652893066, + "learning_rate": 5.722524365305235e-06, + "loss": 2.5839, + "step": 8546000 + }, + { + "epoch": 2.656803970362173, + "grad_norm": 16.073734283447266, + "learning_rate": 5.7199338272971214e-06, + "loss": 2.5974, + "step": 8546500 + }, + { + "epoch": 2.6569594026426597, + "grad_norm": 12.653203964233398, + "learning_rate": 5.717343289289007e-06, + "loss": 2.6211, + "step": 8547000 + }, + { + "epoch": 2.6571148349231466, + "grad_norm": 8.646342277526855, + "learning_rate": 5.714752751280892e-06, + "loss": 2.6668, + "step": 8547500 + }, + { + "epoch": 2.6572702672036335, + "grad_norm": 18.29312515258789, + "learning_rate": 5.712162213272777e-06, + "loss": 2.5637, + "step": 8548000 + }, + { + "epoch": 2.6574256994841203, + "grad_norm": 9.066051483154297, + "learning_rate": 5.7095716752646624e-06, + "loss": 2.5797, + "step": 8548500 + }, + { + "epoch": 2.657581131764607, + "grad_norm": 10.72374439239502, + "learning_rate": 5.706981137256548e-06, + "loss": 2.6181, + "step": 8549000 + }, + { + "epoch": 2.657736564045094, + "grad_norm": 8.859230995178223, + "learning_rate": 5.704390599248433e-06, + "loss": 2.561, + "step": 8549500 + }, + { + "epoch": 2.657891996325581, + "grad_norm": 11.91225814819336, + "learning_rate": 5.701800061240319e-06, + "loss": 2.5794, + "step": 8550000 + }, + { + "epoch": 2.658047428606068, + "grad_norm": 6.846076965332031, + "learning_rate": 5.699209523232204e-06, + "loss": 2.6461, + "step": 8550500 + }, + { + "epoch": 2.6582028608865547, + "grad_norm": 11.28172492980957, + "learning_rate": 5.69661898522409e-06, + "loss": 2.5361, + "step": 8551000 + }, + { + "epoch": 2.6583582931670415, + "grad_norm": 36.28239059448242, + "learning_rate": 5.694028447215975e-06, + "loss": 2.5875, + "step": 8551500 + }, + { + "epoch": 2.6585137254475284, + "grad_norm": 10.362607955932617, + "learning_rate": 5.691437909207861e-06, + "loss": 2.5761, + "step": 8552000 + }, + { + "epoch": 2.6586691577280153, + "grad_norm": 10.314974784851074, + "learning_rate": 5.688847371199746e-06, + "loss": 2.5614, + "step": 8552500 + }, + { + "epoch": 2.658824590008502, + "grad_norm": 26.393768310546875, + "learning_rate": 5.6862568331916316e-06, + "loss": 2.6192, + "step": 8553000 + }, + { + "epoch": 2.658980022288989, + "grad_norm": 10.471353530883789, + "learning_rate": 5.683666295183516e-06, + "loss": 2.5834, + "step": 8553500 + }, + { + "epoch": 2.659135454569476, + "grad_norm": 13.900456428527832, + "learning_rate": 5.681075757175402e-06, + "loss": 2.5741, + "step": 8554000 + }, + { + "epoch": 2.6592908868499627, + "grad_norm": 11.688100814819336, + "learning_rate": 5.678485219167287e-06, + "loss": 2.557, + "step": 8554500 + }, + { + "epoch": 2.6594463191304496, + "grad_norm": 22.662540435791016, + "learning_rate": 5.6758946811591726e-06, + "loss": 2.5361, + "step": 8555000 + }, + { + "epoch": 2.6596017514109365, + "grad_norm": 13.008201599121094, + "learning_rate": 5.673304143151059e-06, + "loss": 2.5683, + "step": 8555500 + }, + { + "epoch": 2.6597571836914233, + "grad_norm": 10.755781173706055, + "learning_rate": 5.6707136051429435e-06, + "loss": 2.5389, + "step": 8556000 + }, + { + "epoch": 2.65991261597191, + "grad_norm": 11.579050064086914, + "learning_rate": 5.668123067134829e-06, + "loss": 2.5584, + "step": 8556500 + }, + { + "epoch": 2.660068048252397, + "grad_norm": 9.82162857055664, + "learning_rate": 5.665532529126714e-06, + "loss": 2.5844, + "step": 8557000 + }, + { + "epoch": 2.660223480532884, + "grad_norm": 9.181156158447266, + "learning_rate": 5.6629419911186e-06, + "loss": 2.6287, + "step": 8557500 + }, + { + "epoch": 2.660378912813371, + "grad_norm": 7.848913192749023, + "learning_rate": 5.660351453110485e-06, + "loss": 2.5667, + "step": 8558000 + }, + { + "epoch": 2.6605343450938577, + "grad_norm": 11.068767547607422, + "learning_rate": 5.657760915102371e-06, + "loss": 2.5337, + "step": 8558500 + }, + { + "epoch": 2.6606897773743445, + "grad_norm": 8.909594535827637, + "learning_rate": 5.655170377094255e-06, + "loss": 2.5741, + "step": 8559000 + }, + { + "epoch": 2.6608452096548314, + "grad_norm": 19.094600677490234, + "learning_rate": 5.652579839086141e-06, + "loss": 2.596, + "step": 8559500 + }, + { + "epoch": 2.6610006419353183, + "grad_norm": 9.432587623596191, + "learning_rate": 5.649989301078027e-06, + "loss": 2.5883, + "step": 8560000 + }, + { + "epoch": 2.661156074215805, + "grad_norm": 10.227919578552246, + "learning_rate": 5.647398763069913e-06, + "loss": 2.581, + "step": 8560500 + }, + { + "epoch": 2.661311506496292, + "grad_norm": 10.353900909423828, + "learning_rate": 5.644808225061798e-06, + "loss": 2.5555, + "step": 8561000 + }, + { + "epoch": 2.661466938776779, + "grad_norm": 9.811572074890137, + "learning_rate": 5.642217687053683e-06, + "loss": 2.5798, + "step": 8561500 + }, + { + "epoch": 2.661622371057266, + "grad_norm": 10.281771659851074, + "learning_rate": 5.639627149045568e-06, + "loss": 2.5886, + "step": 8562000 + }, + { + "epoch": 2.6617778033377526, + "grad_norm": 6.42351770401001, + "learning_rate": 5.637036611037454e-06, + "loss": 2.6049, + "step": 8562500 + }, + { + "epoch": 2.66193323561824, + "grad_norm": 9.735358238220215, + "learning_rate": 5.634446073029339e-06, + "loss": 2.5826, + "step": 8563000 + }, + { + "epoch": 2.6620886678987263, + "grad_norm": 22.35560417175293, + "learning_rate": 5.6318555350212245e-06, + "loss": 2.5398, + "step": 8563500 + }, + { + "epoch": 2.6622441001792136, + "grad_norm": 9.646799087524414, + "learning_rate": 5.62926499701311e-06, + "loss": 2.5858, + "step": 8564000 + }, + { + "epoch": 2.6623995324597, + "grad_norm": 11.52233600616455, + "learning_rate": 5.6266744590049954e-06, + "loss": 2.5786, + "step": 8564500 + }, + { + "epoch": 2.6625549647401874, + "grad_norm": 8.770572662353516, + "learning_rate": 5.624083920996881e-06, + "loss": 2.5655, + "step": 8565000 + }, + { + "epoch": 2.662710397020674, + "grad_norm": 10.038788795471191, + "learning_rate": 5.621493382988766e-06, + "loss": 2.5743, + "step": 8565500 + }, + { + "epoch": 2.662865829301161, + "grad_norm": 17.667009353637695, + "learning_rate": 5.618902844980652e-06, + "loss": 2.5725, + "step": 8566000 + }, + { + "epoch": 2.6630212615816475, + "grad_norm": 8.72416877746582, + "learning_rate": 5.616312306972537e-06, + "loss": 2.5521, + "step": 8566500 + }, + { + "epoch": 2.663176693862135, + "grad_norm": 11.226224899291992, + "learning_rate": 5.613721768964422e-06, + "loss": 2.6038, + "step": 8567000 + }, + { + "epoch": 2.6633321261426217, + "grad_norm": 9.79393482208252, + "learning_rate": 5.611131230956307e-06, + "loss": 2.6192, + "step": 8567500 + }, + { + "epoch": 2.6634875584231086, + "grad_norm": 9.396702766418457, + "learning_rate": 5.608540692948193e-06, + "loss": 2.5534, + "step": 8568000 + }, + { + "epoch": 2.6636429907035954, + "grad_norm": 10.670044898986816, + "learning_rate": 5.605950154940078e-06, + "loss": 2.6332, + "step": 8568500 + }, + { + "epoch": 2.6637984229840823, + "grad_norm": 8.61092758178711, + "learning_rate": 5.6033596169319646e-06, + "loss": 2.5619, + "step": 8569000 + }, + { + "epoch": 2.663953855264569, + "grad_norm": 10.29383373260498, + "learning_rate": 5.600769078923849e-06, + "loss": 2.5536, + "step": 8569500 + }, + { + "epoch": 2.664109287545056, + "grad_norm": 8.161858558654785, + "learning_rate": 5.598178540915735e-06, + "loss": 2.5933, + "step": 8570000 + }, + { + "epoch": 2.664264719825543, + "grad_norm": 11.069231033325195, + "learning_rate": 5.59558800290762e-06, + "loss": 2.5633, + "step": 8570500 + }, + { + "epoch": 2.66442015210603, + "grad_norm": 12.502957344055176, + "learning_rate": 5.5929974648995056e-06, + "loss": 2.5457, + "step": 8571000 + }, + { + "epoch": 2.6645755843865166, + "grad_norm": 8.638686180114746, + "learning_rate": 5.590406926891391e-06, + "loss": 2.6003, + "step": 8571500 + }, + { + "epoch": 2.6647310166670035, + "grad_norm": 10.307499885559082, + "learning_rate": 5.5878163888832765e-06, + "loss": 2.5533, + "step": 8572000 + }, + { + "epoch": 2.6648864489474904, + "grad_norm": 9.497299194335938, + "learning_rate": 5.585225850875161e-06, + "loss": 2.5223, + "step": 8572500 + }, + { + "epoch": 2.6650418812279772, + "grad_norm": 8.996347427368164, + "learning_rate": 5.5826353128670465e-06, + "loss": 2.6418, + "step": 8573000 + }, + { + "epoch": 2.665197313508464, + "grad_norm": 11.298912048339844, + "learning_rate": 5.580044774858933e-06, + "loss": 2.5898, + "step": 8573500 + }, + { + "epoch": 2.665352745788951, + "grad_norm": 8.601521492004395, + "learning_rate": 5.577454236850818e-06, + "loss": 2.5988, + "step": 8574000 + }, + { + "epoch": 2.665508178069438, + "grad_norm": 9.749378204345703, + "learning_rate": 5.574863698842704e-06, + "loss": 2.591, + "step": 8574500 + }, + { + "epoch": 2.6656636103499247, + "grad_norm": 19.928922653198242, + "learning_rate": 5.572273160834588e-06, + "loss": 2.6225, + "step": 8575000 + }, + { + "epoch": 2.6658190426304116, + "grad_norm": 9.383539199829102, + "learning_rate": 5.569682622826474e-06, + "loss": 2.6027, + "step": 8575500 + }, + { + "epoch": 2.6659744749108985, + "grad_norm": 9.907903671264648, + "learning_rate": 5.567092084818359e-06, + "loss": 2.5888, + "step": 8576000 + }, + { + "epoch": 2.6661299071913853, + "grad_norm": 15.564483642578125, + "learning_rate": 5.564501546810245e-06, + "loss": 2.5665, + "step": 8576500 + }, + { + "epoch": 2.666285339471872, + "grad_norm": 9.914799690246582, + "learning_rate": 5.56191100880213e-06, + "loss": 2.6026, + "step": 8577000 + }, + { + "epoch": 2.666440771752359, + "grad_norm": 12.492156982421875, + "learning_rate": 5.559320470794016e-06, + "loss": 2.5389, + "step": 8577500 + }, + { + "epoch": 2.666596204032846, + "grad_norm": 10.78565502166748, + "learning_rate": 5.556729932785901e-06, + "loss": 2.6092, + "step": 8578000 + }, + { + "epoch": 2.666751636313333, + "grad_norm": 12.757322311401367, + "learning_rate": 5.554139394777787e-06, + "loss": 2.5913, + "step": 8578500 + }, + { + "epoch": 2.6669070685938197, + "grad_norm": 11.235438346862793, + "learning_rate": 5.551548856769672e-06, + "loss": 2.591, + "step": 8579000 + }, + { + "epoch": 2.6670625008743065, + "grad_norm": 10.758502960205078, + "learning_rate": 5.5489583187615575e-06, + "loss": 2.6064, + "step": 8579500 + }, + { + "epoch": 2.6672179331547934, + "grad_norm": 9.500304222106934, + "learning_rate": 5.546367780753443e-06, + "loss": 2.5643, + "step": 8580000 + }, + { + "epoch": 2.6673733654352803, + "grad_norm": 11.968565940856934, + "learning_rate": 5.543777242745328e-06, + "loss": 2.639, + "step": 8580500 + }, + { + "epoch": 2.667528797715767, + "grad_norm": 9.163712501525879, + "learning_rate": 5.541186704737213e-06, + "loss": 2.5899, + "step": 8581000 + }, + { + "epoch": 2.667684229996254, + "grad_norm": 10.403534889221191, + "learning_rate": 5.5385961667290985e-06, + "loss": 2.5581, + "step": 8581500 + }, + { + "epoch": 2.667839662276741, + "grad_norm": 9.675067901611328, + "learning_rate": 5.536005628720984e-06, + "loss": 2.5491, + "step": 8582000 + }, + { + "epoch": 2.6679950945572277, + "grad_norm": 10.591714859008789, + "learning_rate": 5.53341509071287e-06, + "loss": 2.6156, + "step": 8582500 + }, + { + "epoch": 2.6681505268377146, + "grad_norm": 11.088881492614746, + "learning_rate": 5.530824552704755e-06, + "loss": 2.5996, + "step": 8583000 + }, + { + "epoch": 2.6683059591182015, + "grad_norm": 9.736783027648926, + "learning_rate": 5.52823401469664e-06, + "loss": 2.5821, + "step": 8583500 + }, + { + "epoch": 2.6684613913986883, + "grad_norm": 11.673462867736816, + "learning_rate": 5.525643476688526e-06, + "loss": 2.5484, + "step": 8584000 + }, + { + "epoch": 2.668616823679175, + "grad_norm": 9.39244270324707, + "learning_rate": 5.523052938680411e-06, + "loss": 2.5706, + "step": 8584500 + }, + { + "epoch": 2.668772255959662, + "grad_norm": 8.373669624328613, + "learning_rate": 5.520462400672297e-06, + "loss": 2.5431, + "step": 8585000 + }, + { + "epoch": 2.6689276882401494, + "grad_norm": 10.053236961364746, + "learning_rate": 5.517871862664182e-06, + "loss": 2.61, + "step": 8585500 + }, + { + "epoch": 2.669083120520636, + "grad_norm": 14.617950439453125, + "learning_rate": 5.515281324656068e-06, + "loss": 2.569, + "step": 8586000 + }, + { + "epoch": 2.669238552801123, + "grad_norm": 26.925817489624023, + "learning_rate": 5.512690786647952e-06, + "loss": 2.5903, + "step": 8586500 + }, + { + "epoch": 2.6693939850816095, + "grad_norm": 43.86872100830078, + "learning_rate": 5.5101002486398386e-06, + "loss": 2.6032, + "step": 8587000 + }, + { + "epoch": 2.669549417362097, + "grad_norm": 11.302499771118164, + "learning_rate": 5.507509710631724e-06, + "loss": 2.5866, + "step": 8587500 + }, + { + "epoch": 2.6697048496425833, + "grad_norm": 10.887476921081543, + "learning_rate": 5.5049191726236095e-06, + "loss": 2.5853, + "step": 8588000 + }, + { + "epoch": 2.6698602819230706, + "grad_norm": 9.24781608581543, + "learning_rate": 5.502328634615495e-06, + "loss": 2.6161, + "step": 8588500 + }, + { + "epoch": 2.670015714203557, + "grad_norm": 11.761858940124512, + "learning_rate": 5.4997380966073795e-06, + "loss": 2.6135, + "step": 8589000 + }, + { + "epoch": 2.6701711464840443, + "grad_norm": 20.42721939086914, + "learning_rate": 5.497147558599265e-06, + "loss": 2.6088, + "step": 8589500 + }, + { + "epoch": 2.6703265787645307, + "grad_norm": 9.943365097045898, + "learning_rate": 5.4945570205911505e-06, + "loss": 2.5491, + "step": 8590000 + }, + { + "epoch": 2.670482011045018, + "grad_norm": 10.424254417419434, + "learning_rate": 5.491966482583036e-06, + "loss": 2.5378, + "step": 8590500 + }, + { + "epoch": 2.6706374433255045, + "grad_norm": 10.98985767364502, + "learning_rate": 5.489375944574921e-06, + "loss": 2.6086, + "step": 8591000 + }, + { + "epoch": 2.6707928756059918, + "grad_norm": 9.008504867553711, + "learning_rate": 5.486785406566807e-06, + "loss": 2.5758, + "step": 8591500 + }, + { + "epoch": 2.6709483078864786, + "grad_norm": 9.09701156616211, + "learning_rate": 5.484194868558692e-06, + "loss": 2.5712, + "step": 8592000 + }, + { + "epoch": 2.6711037401669655, + "grad_norm": 25.428194046020508, + "learning_rate": 5.481604330550578e-06, + "loss": 2.5681, + "step": 8592500 + }, + { + "epoch": 2.6712591724474524, + "grad_norm": 9.105399131774902, + "learning_rate": 5.479013792542463e-06, + "loss": 2.6005, + "step": 8593000 + }, + { + "epoch": 2.6714146047279392, + "grad_norm": 9.761425971984863, + "learning_rate": 5.476423254534349e-06, + "loss": 2.5913, + "step": 8593500 + }, + { + "epoch": 2.671570037008426, + "grad_norm": 8.664178848266602, + "learning_rate": 5.473832716526234e-06, + "loss": 2.6315, + "step": 8594000 + }, + { + "epoch": 2.671725469288913, + "grad_norm": 11.85086441040039, + "learning_rate": 5.471242178518119e-06, + "loss": 2.5805, + "step": 8594500 + }, + { + "epoch": 2.6718809015694, + "grad_norm": 13.358564376831055, + "learning_rate": 5.468651640510004e-06, + "loss": 2.5921, + "step": 8595000 + }, + { + "epoch": 2.6720363338498867, + "grad_norm": 7.996952056884766, + "learning_rate": 5.46606110250189e-06, + "loss": 2.6006, + "step": 8595500 + }, + { + "epoch": 2.6721917661303736, + "grad_norm": 10.289213180541992, + "learning_rate": 5.463470564493776e-06, + "loss": 2.5552, + "step": 8596000 + }, + { + "epoch": 2.6723471984108604, + "grad_norm": 15.259156227111816, + "learning_rate": 5.4608800264856614e-06, + "loss": 2.5832, + "step": 8596500 + }, + { + "epoch": 2.6725026306913473, + "grad_norm": 10.704061508178711, + "learning_rate": 5.458289488477546e-06, + "loss": 2.5882, + "step": 8597000 + }, + { + "epoch": 2.672658062971834, + "grad_norm": 9.318482398986816, + "learning_rate": 5.4556989504694315e-06, + "loss": 2.5693, + "step": 8597500 + }, + { + "epoch": 2.672813495252321, + "grad_norm": 9.984946250915527, + "learning_rate": 5.453108412461317e-06, + "loss": 2.5756, + "step": 8598000 + }, + { + "epoch": 2.672968927532808, + "grad_norm": 19.409934997558594, + "learning_rate": 5.4505178744532024e-06, + "loss": 2.6299, + "step": 8598500 + }, + { + "epoch": 2.6731243598132948, + "grad_norm": 12.344781875610352, + "learning_rate": 5.447927336445088e-06, + "loss": 2.6149, + "step": 8599000 + }, + { + "epoch": 2.6732797920937816, + "grad_norm": 12.965024948120117, + "learning_rate": 5.445336798436973e-06, + "loss": 2.5529, + "step": 8599500 + }, + { + "epoch": 2.6734352243742685, + "grad_norm": 10.774566650390625, + "learning_rate": 5.442746260428859e-06, + "loss": 2.5848, + "step": 8600000 + }, + { + "epoch": 2.6735906566547554, + "grad_norm": 9.623488426208496, + "learning_rate": 5.440155722420744e-06, + "loss": 2.6201, + "step": 8600500 + }, + { + "epoch": 2.6737460889352422, + "grad_norm": 10.638114929199219, + "learning_rate": 5.43756518441263e-06, + "loss": 2.5497, + "step": 8601000 + }, + { + "epoch": 2.673901521215729, + "grad_norm": 22.71841812133789, + "learning_rate": 5.434974646404515e-06, + "loss": 2.6013, + "step": 8601500 + }, + { + "epoch": 2.674056953496216, + "grad_norm": 19.04287338256836, + "learning_rate": 5.432384108396401e-06, + "loss": 2.5843, + "step": 8602000 + }, + { + "epoch": 2.674212385776703, + "grad_norm": 9.090229034423828, + "learning_rate": 5.429793570388285e-06, + "loss": 2.5533, + "step": 8602500 + }, + { + "epoch": 2.6743678180571897, + "grad_norm": 20.82131576538086, + "learning_rate": 5.427203032380171e-06, + "loss": 2.5733, + "step": 8603000 + }, + { + "epoch": 2.6745232503376766, + "grad_norm": 10.430685997009277, + "learning_rate": 5.424612494372056e-06, + "loss": 2.5936, + "step": 8603500 + }, + { + "epoch": 2.6746786826181634, + "grad_norm": 19.93903160095215, + "learning_rate": 5.422021956363942e-06, + "loss": 2.5851, + "step": 8604000 + }, + { + "epoch": 2.6748341148986503, + "grad_norm": 11.284131050109863, + "learning_rate": 5.419431418355828e-06, + "loss": 2.5865, + "step": 8604500 + }, + { + "epoch": 2.674989547179137, + "grad_norm": 8.64222240447998, + "learning_rate": 5.4168408803477126e-06, + "loss": 2.6215, + "step": 8605000 + }, + { + "epoch": 2.675144979459624, + "grad_norm": 10.237725257873535, + "learning_rate": 5.414250342339598e-06, + "loss": 2.5883, + "step": 8605500 + }, + { + "epoch": 2.675300411740111, + "grad_norm": 38.15378952026367, + "learning_rate": 5.4116598043314835e-06, + "loss": 2.5846, + "step": 8606000 + }, + { + "epoch": 2.6754558440205978, + "grad_norm": 10.777291297912598, + "learning_rate": 5.409069266323369e-06, + "loss": 2.5918, + "step": 8606500 + }, + { + "epoch": 2.6756112763010846, + "grad_norm": 9.694234848022461, + "learning_rate": 5.406478728315254e-06, + "loss": 2.5947, + "step": 8607000 + }, + { + "epoch": 2.6757667085815715, + "grad_norm": 13.788326263427734, + "learning_rate": 5.40388819030714e-06, + "loss": 2.5973, + "step": 8607500 + }, + { + "epoch": 2.6759221408620584, + "grad_norm": 12.421503067016602, + "learning_rate": 5.4012976522990245e-06, + "loss": 2.5849, + "step": 8608000 + }, + { + "epoch": 2.6760775731425452, + "grad_norm": 11.312088966369629, + "learning_rate": 5.39870711429091e-06, + "loss": 2.6174, + "step": 8608500 + }, + { + "epoch": 2.676233005423032, + "grad_norm": 9.296019554138184, + "learning_rate": 5.396116576282796e-06, + "loss": 2.5564, + "step": 8609000 + }, + { + "epoch": 2.676388437703519, + "grad_norm": 10.770111083984375, + "learning_rate": 5.393526038274682e-06, + "loss": 2.6059, + "step": 8609500 + }, + { + "epoch": 2.6765438699840063, + "grad_norm": 11.51449966430664, + "learning_rate": 5.390935500266567e-06, + "loss": 2.5518, + "step": 8610000 + }, + { + "epoch": 2.6766993022644927, + "grad_norm": 10.663434982299805, + "learning_rate": 5.388344962258452e-06, + "loss": 2.6103, + "step": 8610500 + }, + { + "epoch": 2.67685473454498, + "grad_norm": 10.50610637664795, + "learning_rate": 5.385754424250337e-06, + "loss": 2.6026, + "step": 8611000 + }, + { + "epoch": 2.6770101668254664, + "grad_norm": 9.585569381713867, + "learning_rate": 5.383163886242223e-06, + "loss": 2.6026, + "step": 8611500 + }, + { + "epoch": 2.6771655991059538, + "grad_norm": 10.103202819824219, + "learning_rate": 5.380573348234108e-06, + "loss": 2.6178, + "step": 8612000 + }, + { + "epoch": 2.67732103138644, + "grad_norm": 10.308722496032715, + "learning_rate": 5.377982810225994e-06, + "loss": 2.5862, + "step": 8612500 + }, + { + "epoch": 2.6774764636669275, + "grad_norm": 15.940855979919434, + "learning_rate": 5.375392272217879e-06, + "loss": 2.5972, + "step": 8613000 + }, + { + "epoch": 2.677631895947414, + "grad_norm": 17.799846649169922, + "learning_rate": 5.3728017342097645e-06, + "loss": 2.5995, + "step": 8613500 + }, + { + "epoch": 2.677787328227901, + "grad_norm": 10.205463409423828, + "learning_rate": 5.37021119620165e-06, + "loss": 2.5368, + "step": 8614000 + }, + { + "epoch": 2.6779427605083876, + "grad_norm": 10.249804496765137, + "learning_rate": 5.3676206581935354e-06, + "loss": 2.5725, + "step": 8614500 + }, + { + "epoch": 2.678098192788875, + "grad_norm": 14.112874984741211, + "learning_rate": 5.365030120185421e-06, + "loss": 2.582, + "step": 8615000 + }, + { + "epoch": 2.678253625069362, + "grad_norm": 9.855664253234863, + "learning_rate": 5.362439582177306e-06, + "loss": 2.5994, + "step": 8615500 + }, + { + "epoch": 2.6784090573498487, + "grad_norm": 11.31820011138916, + "learning_rate": 5.359849044169191e-06, + "loss": 2.5901, + "step": 8616000 + }, + { + "epoch": 2.6785644896303356, + "grad_norm": 29.400407791137695, + "learning_rate": 5.357258506161076e-06, + "loss": 2.6021, + "step": 8616500 + }, + { + "epoch": 2.6787199219108224, + "grad_norm": 6.511735916137695, + "learning_rate": 5.354667968152962e-06, + "loss": 2.5663, + "step": 8617000 + }, + { + "epoch": 2.6788753541913093, + "grad_norm": 11.230974197387695, + "learning_rate": 5.352077430144847e-06, + "loss": 2.5952, + "step": 8617500 + }, + { + "epoch": 2.679030786471796, + "grad_norm": 9.370247840881348, + "learning_rate": 5.349486892136734e-06, + "loss": 2.6086, + "step": 8618000 + }, + { + "epoch": 2.679186218752283, + "grad_norm": 9.80864429473877, + "learning_rate": 5.346896354128618e-06, + "loss": 2.5759, + "step": 8618500 + }, + { + "epoch": 2.67934165103277, + "grad_norm": 10.509843826293945, + "learning_rate": 5.344305816120504e-06, + "loss": 2.5904, + "step": 8619000 + }, + { + "epoch": 2.6794970833132568, + "grad_norm": 8.103572845458984, + "learning_rate": 5.341715278112389e-06, + "loss": 2.5594, + "step": 8619500 + }, + { + "epoch": 2.6796525155937436, + "grad_norm": 9.779763221740723, + "learning_rate": 5.339124740104275e-06, + "loss": 2.5537, + "step": 8620000 + }, + { + "epoch": 2.6798079478742305, + "grad_norm": 10.253069877624512, + "learning_rate": 5.33653420209616e-06, + "loss": 2.563, + "step": 8620500 + }, + { + "epoch": 2.6799633801547174, + "grad_norm": 10.59827709197998, + "learning_rate": 5.3339436640880456e-06, + "loss": 2.5684, + "step": 8621000 + }, + { + "epoch": 2.6801188124352042, + "grad_norm": 11.163259506225586, + "learning_rate": 5.33135312607993e-06, + "loss": 2.5936, + "step": 8621500 + }, + { + "epoch": 2.680274244715691, + "grad_norm": 8.5137300491333, + "learning_rate": 5.328762588071816e-06, + "loss": 2.5878, + "step": 8622000 + }, + { + "epoch": 2.680429676996178, + "grad_norm": 11.612781524658203, + "learning_rate": 5.326172050063702e-06, + "loss": 2.5681, + "step": 8622500 + }, + { + "epoch": 2.680585109276665, + "grad_norm": 8.738204956054688, + "learning_rate": 5.323581512055587e-06, + "loss": 2.548, + "step": 8623000 + }, + { + "epoch": 2.6807405415571517, + "grad_norm": 9.093335151672363, + "learning_rate": 5.320990974047473e-06, + "loss": 2.5832, + "step": 8623500 + }, + { + "epoch": 2.6808959738376386, + "grad_norm": 12.092567443847656, + "learning_rate": 5.3184004360393575e-06, + "loss": 2.579, + "step": 8624000 + }, + { + "epoch": 2.6810514061181254, + "grad_norm": 14.795232772827148, + "learning_rate": 5.315809898031243e-06, + "loss": 2.5343, + "step": 8624500 + }, + { + "epoch": 2.6812068383986123, + "grad_norm": 8.855393409729004, + "learning_rate": 5.313219360023128e-06, + "loss": 2.5907, + "step": 8625000 + }, + { + "epoch": 2.681362270679099, + "grad_norm": 11.940059661865234, + "learning_rate": 5.310628822015014e-06, + "loss": 2.6106, + "step": 8625500 + }, + { + "epoch": 2.681517702959586, + "grad_norm": 9.306549072265625, + "learning_rate": 5.308038284006899e-06, + "loss": 2.5652, + "step": 8626000 + }, + { + "epoch": 2.681673135240073, + "grad_norm": 9.100332260131836, + "learning_rate": 5.305447745998785e-06, + "loss": 2.626, + "step": 8626500 + }, + { + "epoch": 2.6818285675205598, + "grad_norm": 10.637104034423828, + "learning_rate": 5.30285720799067e-06, + "loss": 2.5554, + "step": 8627000 + }, + { + "epoch": 2.6819839998010466, + "grad_norm": 8.789618492126465, + "learning_rate": 5.300266669982556e-06, + "loss": 2.6136, + "step": 8627500 + }, + { + "epoch": 2.6821394320815335, + "grad_norm": 11.921761512756348, + "learning_rate": 5.297676131974441e-06, + "loss": 2.6002, + "step": 8628000 + }, + { + "epoch": 2.6822948643620204, + "grad_norm": 10.124332427978516, + "learning_rate": 5.295085593966327e-06, + "loss": 2.6043, + "step": 8628500 + }, + { + "epoch": 2.6824502966425072, + "grad_norm": 10.302042961120605, + "learning_rate": 5.292495055958212e-06, + "loss": 2.5429, + "step": 8629000 + }, + { + "epoch": 2.682605728922994, + "grad_norm": 11.63502025604248, + "learning_rate": 5.289904517950097e-06, + "loss": 2.5596, + "step": 8629500 + }, + { + "epoch": 2.682761161203481, + "grad_norm": 79.91144561767578, + "learning_rate": 5.287313979941982e-06, + "loss": 2.5768, + "step": 8630000 + }, + { + "epoch": 2.682916593483968, + "grad_norm": 11.481793403625488, + "learning_rate": 5.284723441933868e-06, + "loss": 2.565, + "step": 8630500 + }, + { + "epoch": 2.6830720257644547, + "grad_norm": 13.261563301086426, + "learning_rate": 5.282132903925753e-06, + "loss": 2.5746, + "step": 8631000 + }, + { + "epoch": 2.6832274580449416, + "grad_norm": 12.54362964630127, + "learning_rate": 5.279542365917639e-06, + "loss": 2.598, + "step": 8631500 + }, + { + "epoch": 2.6833828903254284, + "grad_norm": 6.260794639587402, + "learning_rate": 5.276951827909524e-06, + "loss": 2.5672, + "step": 8632000 + }, + { + "epoch": 2.6835383226059153, + "grad_norm": 10.181584358215332, + "learning_rate": 5.2743612899014094e-06, + "loss": 2.5882, + "step": 8632500 + }, + { + "epoch": 2.683693754886402, + "grad_norm": 11.838445663452148, + "learning_rate": 5.271770751893295e-06, + "loss": 2.5675, + "step": 8633000 + }, + { + "epoch": 2.683849187166889, + "grad_norm": 7.882524490356445, + "learning_rate": 5.26918021388518e-06, + "loss": 2.5566, + "step": 8633500 + }, + { + "epoch": 2.684004619447376, + "grad_norm": 12.104247093200684, + "learning_rate": 5.266589675877066e-06, + "loss": 2.5587, + "step": 8634000 + }, + { + "epoch": 2.684160051727863, + "grad_norm": 10.73119068145752, + "learning_rate": 5.263999137868951e-06, + "loss": 2.5819, + "step": 8634500 + }, + { + "epoch": 2.6843154840083496, + "grad_norm": 16.185386657714844, + "learning_rate": 5.261408599860837e-06, + "loss": 2.6076, + "step": 8635000 + }, + { + "epoch": 2.684470916288837, + "grad_norm": 9.475403785705566, + "learning_rate": 5.258818061852721e-06, + "loss": 2.5737, + "step": 8635500 + }, + { + "epoch": 2.6846263485693234, + "grad_norm": 13.372493743896484, + "learning_rate": 5.256227523844608e-06, + "loss": 2.5886, + "step": 8636000 + }, + { + "epoch": 2.6847817808498107, + "grad_norm": 9.192715644836426, + "learning_rate": 5.253636985836493e-06, + "loss": 2.6134, + "step": 8636500 + }, + { + "epoch": 2.684937213130297, + "grad_norm": 7.544852256774902, + "learning_rate": 5.2510464478283786e-06, + "loss": 2.5785, + "step": 8637000 + }, + { + "epoch": 2.6850926454107844, + "grad_norm": 9.437175750732422, + "learning_rate": 5.248455909820264e-06, + "loss": 2.5803, + "step": 8637500 + }, + { + "epoch": 2.685248077691271, + "grad_norm": 10.922019004821777, + "learning_rate": 5.245865371812149e-06, + "loss": 2.5796, + "step": 8638000 + }, + { + "epoch": 2.685403509971758, + "grad_norm": 8.670550346374512, + "learning_rate": 5.243274833804034e-06, + "loss": 2.6092, + "step": 8638500 + }, + { + "epoch": 2.6855589422522446, + "grad_norm": 8.356266021728516, + "learning_rate": 5.2406842957959195e-06, + "loss": 2.5853, + "step": 8639000 + }, + { + "epoch": 2.685714374532732, + "grad_norm": 9.363541603088379, + "learning_rate": 5.238093757787805e-06, + "loss": 2.6244, + "step": 8639500 + }, + { + "epoch": 2.6858698068132187, + "grad_norm": 10.342141151428223, + "learning_rate": 5.2355032197796905e-06, + "loss": 2.6032, + "step": 8640000 + }, + { + "epoch": 2.6860252390937056, + "grad_norm": 10.980114936828613, + "learning_rate": 5.232912681771576e-06, + "loss": 2.5895, + "step": 8640500 + }, + { + "epoch": 2.6861806713741925, + "grad_norm": 10.833930015563965, + "learning_rate": 5.230322143763461e-06, + "loss": 2.5887, + "step": 8641000 + }, + { + "epoch": 2.6863361036546793, + "grad_norm": 12.392942428588867, + "learning_rate": 5.227731605755347e-06, + "loss": 2.5774, + "step": 8641500 + }, + { + "epoch": 2.686491535935166, + "grad_norm": 6.391676902770996, + "learning_rate": 5.225141067747232e-06, + "loss": 2.6116, + "step": 8642000 + }, + { + "epoch": 2.686646968215653, + "grad_norm": 10.826515197753906, + "learning_rate": 5.222550529739118e-06, + "loss": 2.5166, + "step": 8642500 + }, + { + "epoch": 2.68680240049614, + "grad_norm": 7.463301181793213, + "learning_rate": 5.219959991731003e-06, + "loss": 2.6107, + "step": 8643000 + }, + { + "epoch": 2.686957832776627, + "grad_norm": 10.03957748413086, + "learning_rate": 5.217369453722888e-06, + "loss": 2.6017, + "step": 8643500 + }, + { + "epoch": 2.6871132650571137, + "grad_norm": 10.807025909423828, + "learning_rate": 5.214778915714773e-06, + "loss": 2.5876, + "step": 8644000 + }, + { + "epoch": 2.6872686973376005, + "grad_norm": 10.668158531188965, + "learning_rate": 5.212188377706659e-06, + "loss": 2.573, + "step": 8644500 + }, + { + "epoch": 2.6874241296180874, + "grad_norm": 9.667227745056152, + "learning_rate": 5.209597839698545e-06, + "loss": 2.6497, + "step": 8645000 + }, + { + "epoch": 2.6875795618985743, + "grad_norm": 11.944743156433105, + "learning_rate": 5.2070073016904305e-06, + "loss": 2.6064, + "step": 8645500 + }, + { + "epoch": 2.687734994179061, + "grad_norm": 8.857224464416504, + "learning_rate": 5.204416763682315e-06, + "loss": 2.5847, + "step": 8646000 + }, + { + "epoch": 2.687890426459548, + "grad_norm": 10.534516334533691, + "learning_rate": 5.201826225674201e-06, + "loss": 2.5882, + "step": 8646500 + }, + { + "epoch": 2.688045858740035, + "grad_norm": 14.364018440246582, + "learning_rate": 5.199235687666086e-06, + "loss": 2.5848, + "step": 8647000 + }, + { + "epoch": 2.6882012910205217, + "grad_norm": 12.136750221252441, + "learning_rate": 5.1966451496579715e-06, + "loss": 2.5679, + "step": 8647500 + }, + { + "epoch": 2.6883567233010086, + "grad_norm": 10.787906646728516, + "learning_rate": 5.194054611649857e-06, + "loss": 2.5959, + "step": 8648000 + }, + { + "epoch": 2.6885121555814955, + "grad_norm": 11.750195503234863, + "learning_rate": 5.1914640736417424e-06, + "loss": 2.6155, + "step": 8648500 + }, + { + "epoch": 2.6886675878619823, + "grad_norm": 7.665991306304932, + "learning_rate": 5.188873535633627e-06, + "loss": 2.5671, + "step": 8649000 + }, + { + "epoch": 2.688823020142469, + "grad_norm": 8.669679641723633, + "learning_rate": 5.186282997625513e-06, + "loss": 2.6111, + "step": 8649500 + }, + { + "epoch": 2.688978452422956, + "grad_norm": 38.23857498168945, + "learning_rate": 5.183692459617399e-06, + "loss": 2.5373, + "step": 8650000 + }, + { + "epoch": 2.689133884703443, + "grad_norm": 21.715560913085938, + "learning_rate": 5.181101921609284e-06, + "loss": 2.5801, + "step": 8650500 + }, + { + "epoch": 2.68928931698393, + "grad_norm": 19.028566360473633, + "learning_rate": 5.17851138360117e-06, + "loss": 2.604, + "step": 8651000 + }, + { + "epoch": 2.6894447492644167, + "grad_norm": 9.507673263549805, + "learning_rate": 5.175920845593054e-06, + "loss": 2.5826, + "step": 8651500 + }, + { + "epoch": 2.6896001815449035, + "grad_norm": 10.12279224395752, + "learning_rate": 5.17333030758494e-06, + "loss": 2.5115, + "step": 8652000 + }, + { + "epoch": 2.6897556138253904, + "grad_norm": 13.723429679870605, + "learning_rate": 5.170739769576825e-06, + "loss": 2.6095, + "step": 8652500 + }, + { + "epoch": 2.6899110461058773, + "grad_norm": 9.339059829711914, + "learning_rate": 5.168149231568711e-06, + "loss": 2.5986, + "step": 8653000 + }, + { + "epoch": 2.690066478386364, + "grad_norm": 9.67098617553711, + "learning_rate": 5.165558693560596e-06, + "loss": 2.5835, + "step": 8653500 + }, + { + "epoch": 2.690221910666851, + "grad_norm": 17.265844345092773, + "learning_rate": 5.162968155552482e-06, + "loss": 2.5772, + "step": 8654000 + }, + { + "epoch": 2.690377342947338, + "grad_norm": 9.372440338134766, + "learning_rate": 5.160377617544367e-06, + "loss": 2.5641, + "step": 8654500 + }, + { + "epoch": 2.6905327752278247, + "grad_norm": 11.635621070861816, + "learning_rate": 5.1577870795362526e-06, + "loss": 2.5515, + "step": 8655000 + }, + { + "epoch": 2.6906882075083116, + "grad_norm": 10.464176177978516, + "learning_rate": 5.155196541528138e-06, + "loss": 2.6018, + "step": 8655500 + }, + { + "epoch": 2.6908436397887985, + "grad_norm": 11.41054916381836, + "learning_rate": 5.1526060035200235e-06, + "loss": 2.5446, + "step": 8656000 + }, + { + "epoch": 2.6909990720692853, + "grad_norm": 10.67934513092041, + "learning_rate": 5.150015465511909e-06, + "loss": 2.5773, + "step": 8656500 + }, + { + "epoch": 2.691154504349772, + "grad_norm": 10.13460922241211, + "learning_rate": 5.1474249275037935e-06, + "loss": 2.6267, + "step": 8657000 + }, + { + "epoch": 2.691309936630259, + "grad_norm": 10.315860748291016, + "learning_rate": 5.144834389495679e-06, + "loss": 2.6349, + "step": 8657500 + }, + { + "epoch": 2.6914653689107464, + "grad_norm": 6.477080821990967, + "learning_rate": 5.1422438514875645e-06, + "loss": 2.5967, + "step": 8658000 + }, + { + "epoch": 2.691620801191233, + "grad_norm": 12.228296279907227, + "learning_rate": 5.139653313479451e-06, + "loss": 2.6244, + "step": 8658500 + }, + { + "epoch": 2.69177623347172, + "grad_norm": 13.843673706054688, + "learning_rate": 5.137062775471336e-06, + "loss": 2.6077, + "step": 8659000 + }, + { + "epoch": 2.6919316657522065, + "grad_norm": 10.239714622497559, + "learning_rate": 5.134472237463221e-06, + "loss": 2.5835, + "step": 8659500 + }, + { + "epoch": 2.692087098032694, + "grad_norm": 12.276991844177246, + "learning_rate": 5.131881699455106e-06, + "loss": 2.6314, + "step": 8660000 + }, + { + "epoch": 2.6922425303131803, + "grad_norm": 9.558337211608887, + "learning_rate": 5.129291161446992e-06, + "loss": 2.604, + "step": 8660500 + }, + { + "epoch": 2.6923979625936676, + "grad_norm": 11.331816673278809, + "learning_rate": 5.126700623438877e-06, + "loss": 2.5972, + "step": 8661000 + }, + { + "epoch": 2.692553394874154, + "grad_norm": 9.267719268798828, + "learning_rate": 5.124110085430763e-06, + "loss": 2.5596, + "step": 8661500 + }, + { + "epoch": 2.6927088271546413, + "grad_norm": 20.24561882019043, + "learning_rate": 5.121519547422648e-06, + "loss": 2.5488, + "step": 8662000 + }, + { + "epoch": 2.6928642594351277, + "grad_norm": 12.159795761108398, + "learning_rate": 5.118929009414533e-06, + "loss": 2.5555, + "step": 8662500 + }, + { + "epoch": 2.693019691715615, + "grad_norm": 10.259662628173828, + "learning_rate": 5.116338471406419e-06, + "loss": 2.5748, + "step": 8663000 + }, + { + "epoch": 2.6931751239961015, + "grad_norm": 6.86586856842041, + "learning_rate": 5.1137479333983045e-06, + "loss": 2.5616, + "step": 8663500 + }, + { + "epoch": 2.693330556276589, + "grad_norm": 10.114288330078125, + "learning_rate": 5.11115739539019e-06, + "loss": 2.5823, + "step": 8664000 + }, + { + "epoch": 2.6934859885570757, + "grad_norm": 10.870023727416992, + "learning_rate": 5.1085668573820754e-06, + "loss": 2.5749, + "step": 8664500 + }, + { + "epoch": 2.6936414208375625, + "grad_norm": 11.468902587890625, + "learning_rate": 5.10597631937396e-06, + "loss": 2.5689, + "step": 8665000 + }, + { + "epoch": 2.6937968531180494, + "grad_norm": 9.897089004516602, + "learning_rate": 5.1033857813658455e-06, + "loss": 2.5802, + "step": 8665500 + }, + { + "epoch": 2.6939522853985363, + "grad_norm": 16.45501708984375, + "learning_rate": 5.100795243357731e-06, + "loss": 2.5616, + "step": 8666000 + }, + { + "epoch": 2.694107717679023, + "grad_norm": 14.355603218078613, + "learning_rate": 5.098204705349616e-06, + "loss": 2.5409, + "step": 8666500 + }, + { + "epoch": 2.69426314995951, + "grad_norm": 10.310498237609863, + "learning_rate": 5.095614167341503e-06, + "loss": 2.5962, + "step": 8667000 + }, + { + "epoch": 2.694418582239997, + "grad_norm": 16.90106773376465, + "learning_rate": 5.093023629333387e-06, + "loss": 2.5665, + "step": 8667500 + }, + { + "epoch": 2.6945740145204837, + "grad_norm": 8.67263126373291, + "learning_rate": 5.090433091325273e-06, + "loss": 2.5433, + "step": 8668000 + }, + { + "epoch": 2.6947294468009706, + "grad_norm": 10.569586753845215, + "learning_rate": 5.087842553317158e-06, + "loss": 2.5676, + "step": 8668500 + }, + { + "epoch": 2.6948848790814575, + "grad_norm": 9.186983108520508, + "learning_rate": 5.085252015309044e-06, + "loss": 2.6291, + "step": 8669000 + }, + { + "epoch": 2.6950403113619443, + "grad_norm": 8.714902877807617, + "learning_rate": 5.082661477300929e-06, + "loss": 2.5625, + "step": 8669500 + }, + { + "epoch": 2.695195743642431, + "grad_norm": 6.947915554046631, + "learning_rate": 5.080070939292815e-06, + "loss": 2.5813, + "step": 8670000 + }, + { + "epoch": 2.695351175922918, + "grad_norm": 9.67037296295166, + "learning_rate": 5.077480401284699e-06, + "loss": 2.5952, + "step": 8670500 + }, + { + "epoch": 2.695506608203405, + "grad_norm": 8.66799545288086, + "learning_rate": 5.074889863276585e-06, + "loss": 2.6125, + "step": 8671000 + }, + { + "epoch": 2.695662040483892, + "grad_norm": 11.702439308166504, + "learning_rate": 5.072299325268471e-06, + "loss": 2.5383, + "step": 8671500 + }, + { + "epoch": 2.6958174727643787, + "grad_norm": 10.409191131591797, + "learning_rate": 5.0697087872603565e-06, + "loss": 2.5441, + "step": 8672000 + }, + { + "epoch": 2.6959729050448655, + "grad_norm": 18.34592056274414, + "learning_rate": 5.067118249252242e-06, + "loss": 2.5874, + "step": 8672500 + }, + { + "epoch": 2.6961283373253524, + "grad_norm": 19.181676864624023, + "learning_rate": 5.0645277112441265e-06, + "loss": 2.5541, + "step": 8673000 + }, + { + "epoch": 2.6962837696058393, + "grad_norm": 10.2540283203125, + "learning_rate": 5.061937173236012e-06, + "loss": 2.5666, + "step": 8673500 + }, + { + "epoch": 2.696439201886326, + "grad_norm": 8.581040382385254, + "learning_rate": 5.0593466352278975e-06, + "loss": 2.5587, + "step": 8674000 + }, + { + "epoch": 2.696594634166813, + "grad_norm": 9.861961364746094, + "learning_rate": 5.056756097219783e-06, + "loss": 2.5999, + "step": 8674500 + }, + { + "epoch": 2.6967500664473, + "grad_norm": 11.52578067779541, + "learning_rate": 5.054165559211668e-06, + "loss": 2.5, + "step": 8675000 + }, + { + "epoch": 2.6969054987277867, + "grad_norm": 8.035163879394531, + "learning_rate": 5.051575021203554e-06, + "loss": 2.6149, + "step": 8675500 + }, + { + "epoch": 2.6970609310082736, + "grad_norm": 10.127237319946289, + "learning_rate": 5.048984483195439e-06, + "loss": 2.5735, + "step": 8676000 + }, + { + "epoch": 2.6972163632887605, + "grad_norm": 11.329395294189453, + "learning_rate": 5.046393945187325e-06, + "loss": 2.5932, + "step": 8676500 + }, + { + "epoch": 2.6973717955692473, + "grad_norm": 17.500112533569336, + "learning_rate": 5.04380340717921e-06, + "loss": 2.5842, + "step": 8677000 + }, + { + "epoch": 2.697527227849734, + "grad_norm": 8.85267448425293, + "learning_rate": 5.041212869171096e-06, + "loss": 2.5904, + "step": 8677500 + }, + { + "epoch": 2.697682660130221, + "grad_norm": 9.407855987548828, + "learning_rate": 5.038622331162981e-06, + "loss": 2.592, + "step": 8678000 + }, + { + "epoch": 2.697838092410708, + "grad_norm": 16.85112953186035, + "learning_rate": 5.036031793154866e-06, + "loss": 2.5903, + "step": 8678500 + }, + { + "epoch": 2.697993524691195, + "grad_norm": 9.558696746826172, + "learning_rate": 5.033441255146751e-06, + "loss": 2.5853, + "step": 8679000 + }, + { + "epoch": 2.6981489569716817, + "grad_norm": 10.506221771240234, + "learning_rate": 5.030850717138637e-06, + "loss": 2.5648, + "step": 8679500 + }, + { + "epoch": 2.6983043892521685, + "grad_norm": 7.322010517120361, + "learning_rate": 5.028260179130522e-06, + "loss": 2.5427, + "step": 8680000 + }, + { + "epoch": 2.6984598215326554, + "grad_norm": 9.59787368774414, + "learning_rate": 5.0256696411224084e-06, + "loss": 2.5753, + "step": 8680500 + }, + { + "epoch": 2.6986152538131423, + "grad_norm": 182.68080139160156, + "learning_rate": 5.023079103114294e-06, + "loss": 2.5624, + "step": 8681000 + }, + { + "epoch": 2.698770686093629, + "grad_norm": 11.47939682006836, + "learning_rate": 5.0204885651061785e-06, + "loss": 2.5766, + "step": 8681500 + }, + { + "epoch": 2.698926118374116, + "grad_norm": 12.719025611877441, + "learning_rate": 5.017898027098064e-06, + "loss": 2.5533, + "step": 8682000 + }, + { + "epoch": 2.6990815506546033, + "grad_norm": 9.012502670288086, + "learning_rate": 5.0153074890899494e-06, + "loss": 2.6077, + "step": 8682500 + }, + { + "epoch": 2.6992369829350897, + "grad_norm": 10.840422630310059, + "learning_rate": 5.012716951081835e-06, + "loss": 2.5908, + "step": 8683000 + }, + { + "epoch": 2.699392415215577, + "grad_norm": 9.827239036560059, + "learning_rate": 5.01012641307372e-06, + "loss": 2.5663, + "step": 8683500 + }, + { + "epoch": 2.6995478474960635, + "grad_norm": 7.829887866973877, + "learning_rate": 5.007535875065606e-06, + "loss": 2.5742, + "step": 8684000 + }, + { + "epoch": 2.699703279776551, + "grad_norm": 12.148933410644531, + "learning_rate": 5.00494533705749e-06, + "loss": 2.5922, + "step": 8684500 + }, + { + "epoch": 2.699858712057037, + "grad_norm": 9.99728012084961, + "learning_rate": 5.002354799049377e-06, + "loss": 2.5446, + "step": 8685000 + }, + { + "epoch": 2.7000141443375245, + "grad_norm": 12.46465015411377, + "learning_rate": 4.999764261041262e-06, + "loss": 2.561, + "step": 8685500 + }, + { + "epoch": 2.700169576618011, + "grad_norm": 8.634017944335938, + "learning_rate": 4.997173723033148e-06, + "loss": 2.6063, + "step": 8686000 + }, + { + "epoch": 2.7003250088984982, + "grad_norm": 12.14361572265625, + "learning_rate": 4.994583185025033e-06, + "loss": 2.6024, + "step": 8686500 + }, + { + "epoch": 2.7004804411789847, + "grad_norm": 10.697999954223633, + "learning_rate": 4.991992647016918e-06, + "loss": 2.6124, + "step": 8687000 + }, + { + "epoch": 2.700635873459472, + "grad_norm": 10.455337524414062, + "learning_rate": 4.989402109008803e-06, + "loss": 2.5846, + "step": 8687500 + }, + { + "epoch": 2.700791305739959, + "grad_norm": 13.274442672729492, + "learning_rate": 4.986811571000689e-06, + "loss": 2.6062, + "step": 8688000 + }, + { + "epoch": 2.7009467380204457, + "grad_norm": 8.38619327545166, + "learning_rate": 4.984221032992574e-06, + "loss": 2.56, + "step": 8688500 + }, + { + "epoch": 2.7011021703009326, + "grad_norm": 12.798860549926758, + "learning_rate": 4.9816304949844595e-06, + "loss": 2.5413, + "step": 8689000 + }, + { + "epoch": 2.7012576025814194, + "grad_norm": 48.091182708740234, + "learning_rate": 4.979039956976345e-06, + "loss": 2.5664, + "step": 8689500 + }, + { + "epoch": 2.7014130348619063, + "grad_norm": 10.720906257629395, + "learning_rate": 4.9764494189682305e-06, + "loss": 2.5768, + "step": 8690000 + }, + { + "epoch": 2.701568467142393, + "grad_norm": 13.020933151245117, + "learning_rate": 4.973858880960116e-06, + "loss": 2.5655, + "step": 8690500 + }, + { + "epoch": 2.70172389942288, + "grad_norm": 10.601363182067871, + "learning_rate": 4.971268342952001e-06, + "loss": 2.5749, + "step": 8691000 + }, + { + "epoch": 2.701879331703367, + "grad_norm": 9.403172492980957, + "learning_rate": 4.968677804943887e-06, + "loss": 2.5569, + "step": 8691500 + }, + { + "epoch": 2.702034763983854, + "grad_norm": 9.627581596374512, + "learning_rate": 4.966087266935772e-06, + "loss": 2.5679, + "step": 8692000 + }, + { + "epoch": 2.7021901962643406, + "grad_norm": 10.19801139831543, + "learning_rate": 4.963496728927657e-06, + "loss": 2.6066, + "step": 8692500 + }, + { + "epoch": 2.7023456285448275, + "grad_norm": 11.524279594421387, + "learning_rate": 4.960906190919542e-06, + "loss": 2.5653, + "step": 8693000 + }, + { + "epoch": 2.7025010608253144, + "grad_norm": 7.783551216125488, + "learning_rate": 4.958315652911428e-06, + "loss": 2.5947, + "step": 8693500 + }, + { + "epoch": 2.7026564931058012, + "grad_norm": 9.766804695129395, + "learning_rate": 4.955725114903314e-06, + "loss": 2.557, + "step": 8694000 + }, + { + "epoch": 2.702811925386288, + "grad_norm": 24.247995376586914, + "learning_rate": 4.9531345768952e-06, + "loss": 2.6432, + "step": 8694500 + }, + { + "epoch": 2.702967357666775, + "grad_norm": 9.598949432373047, + "learning_rate": 4.950544038887084e-06, + "loss": 2.5868, + "step": 8695000 + }, + { + "epoch": 2.703122789947262, + "grad_norm": 6.553107738494873, + "learning_rate": 4.94795350087897e-06, + "loss": 2.59, + "step": 8695500 + }, + { + "epoch": 2.7032782222277487, + "grad_norm": 10.697094917297363, + "learning_rate": 4.945362962870855e-06, + "loss": 2.5615, + "step": 8696000 + }, + { + "epoch": 2.7034336545082356, + "grad_norm": 9.136187553405762, + "learning_rate": 4.942772424862741e-06, + "loss": 2.5811, + "step": 8696500 + }, + { + "epoch": 2.7035890867887225, + "grad_norm": 9.485158920288086, + "learning_rate": 4.940181886854626e-06, + "loss": 2.5572, + "step": 8697000 + }, + { + "epoch": 2.7037445190692093, + "grad_norm": 9.799162864685059, + "learning_rate": 4.9375913488465115e-06, + "loss": 2.5996, + "step": 8697500 + }, + { + "epoch": 2.703899951349696, + "grad_norm": 10.627315521240234, + "learning_rate": 4.935000810838396e-06, + "loss": 2.5513, + "step": 8698000 + }, + { + "epoch": 2.704055383630183, + "grad_norm": 11.257600784301758, + "learning_rate": 4.9324102728302824e-06, + "loss": 2.5939, + "step": 8698500 + }, + { + "epoch": 2.70421081591067, + "grad_norm": 11.892333984375, + "learning_rate": 4.929819734822168e-06, + "loss": 2.5737, + "step": 8699000 + }, + { + "epoch": 2.704366248191157, + "grad_norm": 10.038484573364258, + "learning_rate": 4.927229196814053e-06, + "loss": 2.548, + "step": 8699500 + }, + { + "epoch": 2.7045216804716437, + "grad_norm": 9.923652648925781, + "learning_rate": 4.924638658805939e-06, + "loss": 2.6412, + "step": 8700000 + }, + { + "epoch": 2.7046771127521305, + "grad_norm": 10.456624031066895, + "learning_rate": 4.922048120797823e-06, + "loss": 2.5965, + "step": 8700500 + }, + { + "epoch": 2.7048325450326174, + "grad_norm": 9.648409843444824, + "learning_rate": 4.919457582789709e-06, + "loss": 2.5741, + "step": 8701000 + }, + { + "epoch": 2.7049879773131043, + "grad_norm": 27.51182746887207, + "learning_rate": 4.916867044781594e-06, + "loss": 2.6346, + "step": 8701500 + }, + { + "epoch": 2.705143409593591, + "grad_norm": 10.267056465148926, + "learning_rate": 4.91427650677348e-06, + "loss": 2.6278, + "step": 8702000 + }, + { + "epoch": 2.705298841874078, + "grad_norm": 13.241268157958984, + "learning_rate": 4.911685968765365e-06, + "loss": 2.568, + "step": 8702500 + }, + { + "epoch": 2.705454274154565, + "grad_norm": 8.954083442687988, + "learning_rate": 4.909095430757251e-06, + "loss": 2.5895, + "step": 8703000 + }, + { + "epoch": 2.7056097064350517, + "grad_norm": 15.994912147521973, + "learning_rate": 4.906504892749136e-06, + "loss": 2.5594, + "step": 8703500 + }, + { + "epoch": 2.7057651387155386, + "grad_norm": 10.609227180480957, + "learning_rate": 4.903914354741022e-06, + "loss": 2.6092, + "step": 8704000 + }, + { + "epoch": 2.7059205709960255, + "grad_norm": 10.336200714111328, + "learning_rate": 4.901323816732907e-06, + "loss": 2.5842, + "step": 8704500 + }, + { + "epoch": 2.7060760032765123, + "grad_norm": 9.39786148071289, + "learning_rate": 4.8987332787247926e-06, + "loss": 2.5727, + "step": 8705000 + }, + { + "epoch": 2.706231435556999, + "grad_norm": 9.666547775268555, + "learning_rate": 4.896142740716678e-06, + "loss": 2.5654, + "step": 8705500 + }, + { + "epoch": 2.7063868678374865, + "grad_norm": 11.33752155303955, + "learning_rate": 4.893552202708563e-06, + "loss": 2.6047, + "step": 8706000 + }, + { + "epoch": 2.706542300117973, + "grad_norm": 9.147972106933594, + "learning_rate": 4.890961664700448e-06, + "loss": 2.5887, + "step": 8706500 + }, + { + "epoch": 2.7066977323984602, + "grad_norm": 8.717978477478027, + "learning_rate": 4.8883711266923335e-06, + "loss": 2.5269, + "step": 8707000 + }, + { + "epoch": 2.7068531646789467, + "grad_norm": 7.697672367095947, + "learning_rate": 4.88578058868422e-06, + "loss": 2.5915, + "step": 8707500 + }, + { + "epoch": 2.707008596959434, + "grad_norm": 11.336021423339844, + "learning_rate": 4.883190050676105e-06, + "loss": 2.5646, + "step": 8708000 + }, + { + "epoch": 2.7071640292399204, + "grad_norm": 11.154090881347656, + "learning_rate": 4.88059951266799e-06, + "loss": 2.5757, + "step": 8708500 + }, + { + "epoch": 2.7073194615204077, + "grad_norm": 30.90177345275879, + "learning_rate": 4.878008974659875e-06, + "loss": 2.6014, + "step": 8709000 + }, + { + "epoch": 2.707474893800894, + "grad_norm": 8.818729400634766, + "learning_rate": 4.875418436651761e-06, + "loss": 2.5729, + "step": 8709500 + }, + { + "epoch": 2.7076303260813814, + "grad_norm": 8.00648021697998, + "learning_rate": 4.872827898643646e-06, + "loss": 2.578, + "step": 8710000 + }, + { + "epoch": 2.707785758361868, + "grad_norm": 10.40027141571045, + "learning_rate": 4.870237360635532e-06, + "loss": 2.5445, + "step": 8710500 + }, + { + "epoch": 2.707941190642355, + "grad_norm": 10.462164878845215, + "learning_rate": 4.867646822627417e-06, + "loss": 2.6103, + "step": 8711000 + }, + { + "epoch": 2.7080966229228416, + "grad_norm": 9.677826881408691, + "learning_rate": 4.865056284619302e-06, + "loss": 2.5694, + "step": 8711500 + }, + { + "epoch": 2.708252055203329, + "grad_norm": 8.387187957763672, + "learning_rate": 4.862465746611188e-06, + "loss": 2.5915, + "step": 8712000 + }, + { + "epoch": 2.7084074874838158, + "grad_norm": 12.252837181091309, + "learning_rate": 4.859875208603074e-06, + "loss": 2.6278, + "step": 8712500 + }, + { + "epoch": 2.7085629197643026, + "grad_norm": 7.66862678527832, + "learning_rate": 4.857284670594959e-06, + "loss": 2.6008, + "step": 8713000 + }, + { + "epoch": 2.7087183520447895, + "grad_norm": 12.026620864868164, + "learning_rate": 4.8546941325868445e-06, + "loss": 2.5462, + "step": 8713500 + }, + { + "epoch": 2.7088737843252764, + "grad_norm": 9.714641571044922, + "learning_rate": 4.852103594578729e-06, + "loss": 2.5732, + "step": 8714000 + }, + { + "epoch": 2.7090292166057632, + "grad_norm": 11.339978218078613, + "learning_rate": 4.849513056570615e-06, + "loss": 2.6106, + "step": 8714500 + }, + { + "epoch": 2.70918464888625, + "grad_norm": 13.1220064163208, + "learning_rate": 4.8469225185625e-06, + "loss": 2.573, + "step": 8715000 + }, + { + "epoch": 2.709340081166737, + "grad_norm": 10.744321823120117, + "learning_rate": 4.8443319805543855e-06, + "loss": 2.5794, + "step": 8715500 + }, + { + "epoch": 2.709495513447224, + "grad_norm": 11.323965072631836, + "learning_rate": 4.841741442546271e-06, + "loss": 2.5976, + "step": 8716000 + }, + { + "epoch": 2.7096509457277107, + "grad_norm": 10.097752571105957, + "learning_rate": 4.839150904538156e-06, + "loss": 2.6125, + "step": 8716500 + }, + { + "epoch": 2.7098063780081976, + "grad_norm": 9.677225112915039, + "learning_rate": 4.836560366530042e-06, + "loss": 2.5523, + "step": 8717000 + }, + { + "epoch": 2.7099618102886844, + "grad_norm": 10.33466911315918, + "learning_rate": 4.833969828521927e-06, + "loss": 2.5992, + "step": 8717500 + }, + { + "epoch": 2.7101172425691713, + "grad_norm": 9.649961471557617, + "learning_rate": 4.831379290513813e-06, + "loss": 2.5114, + "step": 8718000 + }, + { + "epoch": 2.710272674849658, + "grad_norm": 7.263194561004639, + "learning_rate": 4.828788752505698e-06, + "loss": 2.6037, + "step": 8718500 + }, + { + "epoch": 2.710428107130145, + "grad_norm": 10.394052505493164, + "learning_rate": 4.826198214497584e-06, + "loss": 2.5957, + "step": 8719000 + }, + { + "epoch": 2.710583539410632, + "grad_norm": 22.500253677368164, + "learning_rate": 4.823607676489468e-06, + "loss": 2.5532, + "step": 8719500 + }, + { + "epoch": 2.7107389716911188, + "grad_norm": 10.365300178527832, + "learning_rate": 4.821017138481354e-06, + "loss": 2.5492, + "step": 8720000 + }, + { + "epoch": 2.7108944039716056, + "grad_norm": 10.655320167541504, + "learning_rate": 4.818426600473239e-06, + "loss": 2.564, + "step": 8720500 + }, + { + "epoch": 2.7110498362520925, + "grad_norm": 57.30172348022461, + "learning_rate": 4.8158360624651256e-06, + "loss": 2.6076, + "step": 8721000 + }, + { + "epoch": 2.7112052685325794, + "grad_norm": 8.092366218566895, + "learning_rate": 4.813245524457011e-06, + "loss": 2.592, + "step": 8721500 + }, + { + "epoch": 2.7113607008130662, + "grad_norm": 7.249851703643799, + "learning_rate": 4.810654986448896e-06, + "loss": 2.5563, + "step": 8722000 + }, + { + "epoch": 2.711516133093553, + "grad_norm": 8.854697227478027, + "learning_rate": 4.808064448440781e-06, + "loss": 2.5665, + "step": 8722500 + }, + { + "epoch": 2.71167156537404, + "grad_norm": 10.738880157470703, + "learning_rate": 4.8054739104326665e-06, + "loss": 2.5679, + "step": 8723000 + }, + { + "epoch": 2.711826997654527, + "grad_norm": 17.432157516479492, + "learning_rate": 4.802883372424552e-06, + "loss": 2.5854, + "step": 8723500 + }, + { + "epoch": 2.7119824299350137, + "grad_norm": 9.154621124267578, + "learning_rate": 4.8002928344164375e-06, + "loss": 2.6031, + "step": 8724000 + }, + { + "epoch": 2.7121378622155006, + "grad_norm": 11.695857048034668, + "learning_rate": 4.797702296408323e-06, + "loss": 2.6656, + "step": 8724500 + }, + { + "epoch": 2.7122932944959874, + "grad_norm": 12.40665340423584, + "learning_rate": 4.795111758400208e-06, + "loss": 2.6175, + "step": 8725000 + }, + { + "epoch": 2.7124487267764743, + "grad_norm": 12.110198974609375, + "learning_rate": 4.792521220392094e-06, + "loss": 2.5871, + "step": 8725500 + }, + { + "epoch": 2.712604159056961, + "grad_norm": 14.7326021194458, + "learning_rate": 4.789930682383979e-06, + "loss": 2.5197, + "step": 8726000 + }, + { + "epoch": 2.712759591337448, + "grad_norm": 9.515284538269043, + "learning_rate": 4.787340144375865e-06, + "loss": 2.5581, + "step": 8726500 + }, + { + "epoch": 2.712915023617935, + "grad_norm": 10.905936241149902, + "learning_rate": 4.78474960636775e-06, + "loss": 2.6013, + "step": 8727000 + }, + { + "epoch": 2.7130704558984218, + "grad_norm": 10.136868476867676, + "learning_rate": 4.782159068359636e-06, + "loss": 2.6085, + "step": 8727500 + }, + { + "epoch": 2.7132258881789086, + "grad_norm": 39.83560562133789, + "learning_rate": 4.77956853035152e-06, + "loss": 2.5555, + "step": 8728000 + }, + { + "epoch": 2.7133813204593955, + "grad_norm": 8.047897338867188, + "learning_rate": 4.776977992343406e-06, + "loss": 2.6239, + "step": 8728500 + }, + { + "epoch": 2.7135367527398824, + "grad_norm": 9.858296394348145, + "learning_rate": 4.774387454335291e-06, + "loss": 2.5997, + "step": 8729000 + }, + { + "epoch": 2.7136921850203692, + "grad_norm": 10.474103927612305, + "learning_rate": 4.7717969163271775e-06, + "loss": 2.5668, + "step": 8729500 + }, + { + "epoch": 2.713847617300856, + "grad_norm": 9.604287147521973, + "learning_rate": 4.769206378319063e-06, + "loss": 2.5962, + "step": 8730000 + }, + { + "epoch": 2.7140030495813434, + "grad_norm": 8.793280601501465, + "learning_rate": 4.766615840310948e-06, + "loss": 2.6333, + "step": 8730500 + }, + { + "epoch": 2.71415848186183, + "grad_norm": 10.780016899108887, + "learning_rate": 4.764025302302833e-06, + "loss": 2.5622, + "step": 8731000 + }, + { + "epoch": 2.714313914142317, + "grad_norm": 12.571551322937012, + "learning_rate": 4.7614347642947185e-06, + "loss": 2.5966, + "step": 8731500 + }, + { + "epoch": 2.7144693464228036, + "grad_norm": 20.982370376586914, + "learning_rate": 4.758844226286604e-06, + "loss": 2.5973, + "step": 8732000 + }, + { + "epoch": 2.714624778703291, + "grad_norm": 11.742409706115723, + "learning_rate": 4.7562536882784894e-06, + "loss": 2.5688, + "step": 8732500 + }, + { + "epoch": 2.7147802109837773, + "grad_norm": 13.351696968078613, + "learning_rate": 4.753663150270375e-06, + "loss": 2.6036, + "step": 8733000 + }, + { + "epoch": 2.7149356432642646, + "grad_norm": 10.794952392578125, + "learning_rate": 4.7510726122622595e-06, + "loss": 2.6028, + "step": 8733500 + }, + { + "epoch": 2.715091075544751, + "grad_norm": 11.10789680480957, + "learning_rate": 4.748482074254146e-06, + "loss": 2.5719, + "step": 8734000 + }, + { + "epoch": 2.7152465078252384, + "grad_norm": 10.601815223693848, + "learning_rate": 4.745891536246031e-06, + "loss": 2.6076, + "step": 8734500 + }, + { + "epoch": 2.7154019401057248, + "grad_norm": 10.977173805236816, + "learning_rate": 4.743300998237917e-06, + "loss": 2.5726, + "step": 8735000 + }, + { + "epoch": 2.715557372386212, + "grad_norm": 10.527423858642578, + "learning_rate": 4.740710460229802e-06, + "loss": 2.5587, + "step": 8735500 + }, + { + "epoch": 2.715712804666699, + "grad_norm": 12.903307914733887, + "learning_rate": 4.738119922221687e-06, + "loss": 2.5761, + "step": 8736000 + }, + { + "epoch": 2.715868236947186, + "grad_norm": 10.460530281066895, + "learning_rate": 4.735529384213572e-06, + "loss": 2.5967, + "step": 8736500 + }, + { + "epoch": 2.7160236692276727, + "grad_norm": 8.80660343170166, + "learning_rate": 4.732938846205458e-06, + "loss": 2.5609, + "step": 8737000 + }, + { + "epoch": 2.7161791015081596, + "grad_norm": 18.149789810180664, + "learning_rate": 4.730348308197343e-06, + "loss": 2.5526, + "step": 8737500 + }, + { + "epoch": 2.7163345337886464, + "grad_norm": 9.381339073181152, + "learning_rate": 4.727757770189229e-06, + "loss": 2.5943, + "step": 8738000 + }, + { + "epoch": 2.7164899660691333, + "grad_norm": 23.27642250061035, + "learning_rate": 4.725167232181114e-06, + "loss": 2.5263, + "step": 8738500 + }, + { + "epoch": 2.71664539834962, + "grad_norm": 10.760457038879395, + "learning_rate": 4.7225766941729995e-06, + "loss": 2.5625, + "step": 8739000 + }, + { + "epoch": 2.716800830630107, + "grad_norm": 9.00622844696045, + "learning_rate": 4.719986156164885e-06, + "loss": 2.6016, + "step": 8739500 + }, + { + "epoch": 2.716956262910594, + "grad_norm": 11.386616706848145, + "learning_rate": 4.7173956181567705e-06, + "loss": 2.5971, + "step": 8740000 + }, + { + "epoch": 2.7171116951910808, + "grad_norm": 15.350517272949219, + "learning_rate": 4.714805080148656e-06, + "loss": 2.569, + "step": 8740500 + }, + { + "epoch": 2.7172671274715676, + "grad_norm": 10.610836029052734, + "learning_rate": 4.712214542140541e-06, + "loss": 2.5402, + "step": 8741000 + }, + { + "epoch": 2.7174225597520545, + "grad_norm": 10.948580741882324, + "learning_rate": 4.709624004132426e-06, + "loss": 2.5631, + "step": 8741500 + }, + { + "epoch": 2.7175779920325414, + "grad_norm": 8.467013359069824, + "learning_rate": 4.7070334661243115e-06, + "loss": 2.5324, + "step": 8742000 + }, + { + "epoch": 2.7177334243130282, + "grad_norm": 9.755630493164062, + "learning_rate": 4.704442928116197e-06, + "loss": 2.589, + "step": 8742500 + }, + { + "epoch": 2.717888856593515, + "grad_norm": 9.587398529052734, + "learning_rate": 4.701852390108083e-06, + "loss": 2.5991, + "step": 8743000 + }, + { + "epoch": 2.718044288874002, + "grad_norm": 10.612990379333496, + "learning_rate": 4.699261852099969e-06, + "loss": 2.5444, + "step": 8743500 + }, + { + "epoch": 2.718199721154489, + "grad_norm": 10.48941421508789, + "learning_rate": 4.696671314091853e-06, + "loss": 2.5782, + "step": 8744000 + }, + { + "epoch": 2.7183551534349757, + "grad_norm": 8.832075119018555, + "learning_rate": 4.694080776083739e-06, + "loss": 2.5624, + "step": 8744500 + }, + { + "epoch": 2.7185105857154626, + "grad_norm": 10.575312614440918, + "learning_rate": 4.691490238075624e-06, + "loss": 2.5561, + "step": 8745000 + }, + { + "epoch": 2.7186660179959494, + "grad_norm": 10.381171226501465, + "learning_rate": 4.68889970006751e-06, + "loss": 2.6199, + "step": 8745500 + }, + { + "epoch": 2.7188214502764363, + "grad_norm": 11.78010368347168, + "learning_rate": 4.686309162059395e-06, + "loss": 2.5772, + "step": 8746000 + }, + { + "epoch": 2.718976882556923, + "grad_norm": 10.98022747039795, + "learning_rate": 4.683718624051281e-06, + "loss": 2.545, + "step": 8746500 + }, + { + "epoch": 2.71913231483741, + "grad_norm": 14.126974105834961, + "learning_rate": 4.681128086043165e-06, + "loss": 2.5939, + "step": 8747000 + }, + { + "epoch": 2.719287747117897, + "grad_norm": 8.843700408935547, + "learning_rate": 4.6785375480350515e-06, + "loss": 2.5674, + "step": 8747500 + }, + { + "epoch": 2.7194431793983838, + "grad_norm": 9.474716186523438, + "learning_rate": 4.675947010026937e-06, + "loss": 2.5939, + "step": 8748000 + }, + { + "epoch": 2.7195986116788706, + "grad_norm": 9.740696907043457, + "learning_rate": 4.6733564720188224e-06, + "loss": 2.5874, + "step": 8748500 + }, + { + "epoch": 2.7197540439593575, + "grad_norm": 9.078758239746094, + "learning_rate": 4.670765934010708e-06, + "loss": 2.5835, + "step": 8749000 + }, + { + "epoch": 2.7199094762398444, + "grad_norm": 11.58473014831543, + "learning_rate": 4.6681753960025925e-06, + "loss": 2.5962, + "step": 8749500 + }, + { + "epoch": 2.7200649085203312, + "grad_norm": 11.5236177444458, + "learning_rate": 4.665584857994478e-06, + "loss": 2.5866, + "step": 8750000 + }, + { + "epoch": 2.720220340800818, + "grad_norm": 22.8742733001709, + "learning_rate": 4.662994319986363e-06, + "loss": 2.5761, + "step": 8750500 + }, + { + "epoch": 2.720375773081305, + "grad_norm": 9.70450210571289, + "learning_rate": 4.660403781978249e-06, + "loss": 2.4979, + "step": 8751000 + }, + { + "epoch": 2.720531205361792, + "grad_norm": 8.987066268920898, + "learning_rate": 4.657813243970134e-06, + "loss": 2.5591, + "step": 8751500 + }, + { + "epoch": 2.7206866376422787, + "grad_norm": 8.739566802978516, + "learning_rate": 4.65522270596202e-06, + "loss": 2.583, + "step": 8752000 + }, + { + "epoch": 2.7208420699227656, + "grad_norm": 11.111477851867676, + "learning_rate": 4.652632167953905e-06, + "loss": 2.6374, + "step": 8752500 + }, + { + "epoch": 2.7209975022032524, + "grad_norm": 11.468490600585938, + "learning_rate": 4.650041629945791e-06, + "loss": 2.5609, + "step": 8753000 + }, + { + "epoch": 2.7211529344837393, + "grad_norm": 12.616913795471191, + "learning_rate": 4.647451091937676e-06, + "loss": 2.5615, + "step": 8753500 + }, + { + "epoch": 2.7213083667642266, + "grad_norm": 11.30484390258789, + "learning_rate": 4.644860553929562e-06, + "loss": 2.5826, + "step": 8754000 + }, + { + "epoch": 2.721463799044713, + "grad_norm": 10.163296699523926, + "learning_rate": 4.642270015921447e-06, + "loss": 2.6049, + "step": 8754500 + }, + { + "epoch": 2.7216192313252003, + "grad_norm": 12.070579528808594, + "learning_rate": 4.639679477913332e-06, + "loss": 2.5622, + "step": 8755000 + }, + { + "epoch": 2.7217746636056868, + "grad_norm": 10.311517715454102, + "learning_rate": 4.637088939905217e-06, + "loss": 2.5706, + "step": 8755500 + }, + { + "epoch": 2.721930095886174, + "grad_norm": 9.878079414367676, + "learning_rate": 4.634498401897103e-06, + "loss": 2.5986, + "step": 8756000 + }, + { + "epoch": 2.7220855281666605, + "grad_norm": 11.850616455078125, + "learning_rate": 4.631907863888989e-06, + "loss": 2.5432, + "step": 8756500 + }, + { + "epoch": 2.722240960447148, + "grad_norm": 8.70861530303955, + "learning_rate": 4.629317325880874e-06, + "loss": 2.5315, + "step": 8757000 + }, + { + "epoch": 2.7223963927276342, + "grad_norm": 10.009892463684082, + "learning_rate": 4.626726787872759e-06, + "loss": 2.5921, + "step": 8757500 + }, + { + "epoch": 2.7225518250081215, + "grad_norm": 8.89759635925293, + "learning_rate": 4.6241362498646445e-06, + "loss": 2.5845, + "step": 8758000 + }, + { + "epoch": 2.722707257288608, + "grad_norm": 10.009660720825195, + "learning_rate": 4.62154571185653e-06, + "loss": 2.5625, + "step": 8758500 + }, + { + "epoch": 2.7228626895690953, + "grad_norm": 9.3422269821167, + "learning_rate": 4.618955173848415e-06, + "loss": 2.5966, + "step": 8759000 + }, + { + "epoch": 2.7230181218495817, + "grad_norm": 45.409542083740234, + "learning_rate": 4.616364635840301e-06, + "loss": 2.5667, + "step": 8759500 + }, + { + "epoch": 2.723173554130069, + "grad_norm": 8.583413124084473, + "learning_rate": 4.613774097832186e-06, + "loss": 2.5701, + "step": 8760000 + }, + { + "epoch": 2.723328986410556, + "grad_norm": 8.133138656616211, + "learning_rate": 4.611183559824071e-06, + "loss": 2.6184, + "step": 8760500 + }, + { + "epoch": 2.7234844186910427, + "grad_norm": 15.09830379486084, + "learning_rate": 4.608593021815957e-06, + "loss": 2.568, + "step": 8761000 + }, + { + "epoch": 2.7236398509715296, + "grad_norm": 13.009496688842773, + "learning_rate": 4.606002483807843e-06, + "loss": 2.5744, + "step": 8761500 + }, + { + "epoch": 2.7237952832520165, + "grad_norm": 11.679337501525879, + "learning_rate": 4.603411945799728e-06, + "loss": 2.5818, + "step": 8762000 + }, + { + "epoch": 2.7239507155325033, + "grad_norm": 16.003084182739258, + "learning_rate": 4.600821407791614e-06, + "loss": 2.6149, + "step": 8762500 + }, + { + "epoch": 2.72410614781299, + "grad_norm": 11.653806686401367, + "learning_rate": 4.598230869783498e-06, + "loss": 2.562, + "step": 8763000 + }, + { + "epoch": 2.724261580093477, + "grad_norm": 12.097158432006836, + "learning_rate": 4.595640331775384e-06, + "loss": 2.5714, + "step": 8763500 + }, + { + "epoch": 2.724417012373964, + "grad_norm": 10.788671493530273, + "learning_rate": 4.593049793767269e-06, + "loss": 2.5821, + "step": 8764000 + }, + { + "epoch": 2.724572444654451, + "grad_norm": 8.688811302185059, + "learning_rate": 4.590459255759155e-06, + "loss": 2.6009, + "step": 8764500 + }, + { + "epoch": 2.7247278769349377, + "grad_norm": 9.117767333984375, + "learning_rate": 4.58786871775104e-06, + "loss": 2.5977, + "step": 8765000 + }, + { + "epoch": 2.7248833092154245, + "grad_norm": 11.400433540344238, + "learning_rate": 4.5852781797429255e-06, + "loss": 2.5608, + "step": 8765500 + }, + { + "epoch": 2.7250387414959114, + "grad_norm": 12.210308074951172, + "learning_rate": 4.582687641734811e-06, + "loss": 2.5932, + "step": 8766000 + }, + { + "epoch": 2.7251941737763983, + "grad_norm": 11.096680641174316, + "learning_rate": 4.580097103726696e-06, + "loss": 2.6018, + "step": 8766500 + }, + { + "epoch": 2.725349606056885, + "grad_norm": 9.145647048950195, + "learning_rate": 4.577506565718582e-06, + "loss": 2.5289, + "step": 8767000 + }, + { + "epoch": 2.725505038337372, + "grad_norm": 32.895198822021484, + "learning_rate": 4.574916027710467e-06, + "loss": 2.5463, + "step": 8767500 + }, + { + "epoch": 2.725660470617859, + "grad_norm": 9.35434627532959, + "learning_rate": 4.572325489702353e-06, + "loss": 2.5547, + "step": 8768000 + }, + { + "epoch": 2.7258159028983457, + "grad_norm": 9.694953918457031, + "learning_rate": 4.569734951694237e-06, + "loss": 2.5436, + "step": 8768500 + }, + { + "epoch": 2.7259713351788326, + "grad_norm": 8.737242698669434, + "learning_rate": 4.567144413686123e-06, + "loss": 2.6175, + "step": 8769000 + }, + { + "epoch": 2.7261267674593195, + "grad_norm": 11.42427921295166, + "learning_rate": 4.564553875678008e-06, + "loss": 2.5893, + "step": 8769500 + }, + { + "epoch": 2.7262821997398063, + "grad_norm": 8.233266830444336, + "learning_rate": 4.561963337669895e-06, + "loss": 2.5837, + "step": 8770000 + }, + { + "epoch": 2.726437632020293, + "grad_norm": 21.570526123046875, + "learning_rate": 4.55937279966178e-06, + "loss": 2.5333, + "step": 8770500 + }, + { + "epoch": 2.72659306430078, + "grad_norm": 8.130749702453613, + "learning_rate": 4.556782261653665e-06, + "loss": 2.5718, + "step": 8771000 + }, + { + "epoch": 2.726748496581267, + "grad_norm": 8.836437225341797, + "learning_rate": 4.55419172364555e-06, + "loss": 2.5878, + "step": 8771500 + }, + { + "epoch": 2.726903928861754, + "grad_norm": 9.311042785644531, + "learning_rate": 4.551601185637436e-06, + "loss": 2.5788, + "step": 8772000 + }, + { + "epoch": 2.7270593611422407, + "grad_norm": 12.125858306884766, + "learning_rate": 4.549010647629321e-06, + "loss": 2.5826, + "step": 8772500 + }, + { + "epoch": 2.7272147934227275, + "grad_norm": 9.61447811126709, + "learning_rate": 4.5464201096212065e-06, + "loss": 2.5271, + "step": 8773000 + }, + { + "epoch": 2.7273702257032144, + "grad_norm": 10.980131149291992, + "learning_rate": 4.543829571613092e-06, + "loss": 2.5791, + "step": 8773500 + }, + { + "epoch": 2.7275256579837013, + "grad_norm": 11.374935150146484, + "learning_rate": 4.5412390336049775e-06, + "loss": 2.5846, + "step": 8774000 + }, + { + "epoch": 2.727681090264188, + "grad_norm": 16.5886287689209, + "learning_rate": 4.538648495596863e-06, + "loss": 2.545, + "step": 8774500 + }, + { + "epoch": 2.727836522544675, + "grad_norm": 10.817718505859375, + "learning_rate": 4.536057957588748e-06, + "loss": 2.5767, + "step": 8775000 + }, + { + "epoch": 2.727991954825162, + "grad_norm": 14.080698013305664, + "learning_rate": 4.533467419580634e-06, + "loss": 2.5291, + "step": 8775500 + }, + { + "epoch": 2.7281473871056487, + "grad_norm": 11.033050537109375, + "learning_rate": 4.530876881572519e-06, + "loss": 2.5919, + "step": 8776000 + }, + { + "epoch": 2.7283028193861356, + "grad_norm": 8.78670597076416, + "learning_rate": 4.528286343564405e-06, + "loss": 2.5663, + "step": 8776500 + }, + { + "epoch": 2.7284582516666225, + "grad_norm": 8.180085182189941, + "learning_rate": 4.525695805556289e-06, + "loss": 2.5238, + "step": 8777000 + }, + { + "epoch": 2.7286136839471093, + "grad_norm": 7.469151973724365, + "learning_rate": 4.523105267548175e-06, + "loss": 2.5802, + "step": 8777500 + }, + { + "epoch": 2.728769116227596, + "grad_norm": 8.786879539489746, + "learning_rate": 4.52051472954006e-06, + "loss": 2.572, + "step": 8778000 + }, + { + "epoch": 2.7289245485080835, + "grad_norm": 11.435747146606445, + "learning_rate": 4.517924191531946e-06, + "loss": 2.6235, + "step": 8778500 + }, + { + "epoch": 2.72907998078857, + "grad_norm": 9.856240272521973, + "learning_rate": 4.515333653523832e-06, + "loss": 2.5975, + "step": 8779000 + }, + { + "epoch": 2.7292354130690573, + "grad_norm": 9.996213912963867, + "learning_rate": 4.512743115515717e-06, + "loss": 2.5835, + "step": 8779500 + }, + { + "epoch": 2.7293908453495437, + "grad_norm": 15.073365211486816, + "learning_rate": 4.510152577507602e-06, + "loss": 2.5376, + "step": 8780000 + }, + { + "epoch": 2.729546277630031, + "grad_norm": 7.786551475524902, + "learning_rate": 4.507562039499488e-06, + "loss": 2.554, + "step": 8780500 + }, + { + "epoch": 2.7297017099105174, + "grad_norm": 11.714203834533691, + "learning_rate": 4.504971501491373e-06, + "loss": 2.5501, + "step": 8781000 + }, + { + "epoch": 2.7298571421910047, + "grad_norm": 8.83115291595459, + "learning_rate": 4.5023809634832585e-06, + "loss": 2.6184, + "step": 8781500 + }, + { + "epoch": 2.730012574471491, + "grad_norm": 11.05189037322998, + "learning_rate": 4.499790425475144e-06, + "loss": 2.6081, + "step": 8782000 + }, + { + "epoch": 2.7301680067519785, + "grad_norm": 45.84932327270508, + "learning_rate": 4.497199887467029e-06, + "loss": 2.6188, + "step": 8782500 + }, + { + "epoch": 2.730323439032465, + "grad_norm": 10.281622886657715, + "learning_rate": 4.494609349458914e-06, + "loss": 2.5881, + "step": 8783000 + }, + { + "epoch": 2.730478871312952, + "grad_norm": 14.578015327453613, + "learning_rate": 4.4920188114508e-06, + "loss": 2.5755, + "step": 8783500 + }, + { + "epoch": 2.7306343035934386, + "grad_norm": 20.986305236816406, + "learning_rate": 4.489428273442686e-06, + "loss": 2.5781, + "step": 8784000 + }, + { + "epoch": 2.730789735873926, + "grad_norm": 11.946401596069336, + "learning_rate": 4.486837735434571e-06, + "loss": 2.5566, + "step": 8784500 + }, + { + "epoch": 2.730945168154413, + "grad_norm": 14.18072509765625, + "learning_rate": 4.484247197426456e-06, + "loss": 2.5644, + "step": 8785000 + }, + { + "epoch": 2.7311006004348997, + "grad_norm": 10.76211929321289, + "learning_rate": 4.481656659418341e-06, + "loss": 2.5756, + "step": 8785500 + }, + { + "epoch": 2.7312560327153865, + "grad_norm": 8.756689071655273, + "learning_rate": 4.479066121410227e-06, + "loss": 2.5978, + "step": 8786000 + }, + { + "epoch": 2.7314114649958734, + "grad_norm": 12.277219772338867, + "learning_rate": 4.476475583402112e-06, + "loss": 2.5682, + "step": 8786500 + }, + { + "epoch": 2.7315668972763603, + "grad_norm": 8.382240295410156, + "learning_rate": 4.473885045393998e-06, + "loss": 2.5385, + "step": 8787000 + }, + { + "epoch": 2.731722329556847, + "grad_norm": 18.042081832885742, + "learning_rate": 4.471294507385883e-06, + "loss": 2.5547, + "step": 8787500 + }, + { + "epoch": 2.731877761837334, + "grad_norm": 14.854573249816895, + "learning_rate": 4.468703969377769e-06, + "loss": 2.6045, + "step": 8788000 + }, + { + "epoch": 2.732033194117821, + "grad_norm": 11.310588836669922, + "learning_rate": 4.466113431369654e-06, + "loss": 2.5874, + "step": 8788500 + }, + { + "epoch": 2.7321886263983077, + "grad_norm": 9.911949157714844, + "learning_rate": 4.4635228933615396e-06, + "loss": 2.5803, + "step": 8789000 + }, + { + "epoch": 2.7323440586787946, + "grad_norm": 9.841390609741211, + "learning_rate": 4.460932355353425e-06, + "loss": 2.5721, + "step": 8789500 + }, + { + "epoch": 2.7324994909592815, + "grad_norm": 10.592936515808105, + "learning_rate": 4.4583418173453105e-06, + "loss": 2.5401, + "step": 8790000 + }, + { + "epoch": 2.7326549232397683, + "grad_norm": 15.482800483703613, + "learning_rate": 4.455751279337195e-06, + "loss": 2.6018, + "step": 8790500 + }, + { + "epoch": 2.732810355520255, + "grad_norm": 8.56532096862793, + "learning_rate": 4.4531607413290805e-06, + "loss": 2.5944, + "step": 8791000 + }, + { + "epoch": 2.732965787800742, + "grad_norm": 17.83604621887207, + "learning_rate": 4.450570203320966e-06, + "loss": 2.5922, + "step": 8791500 + }, + { + "epoch": 2.733121220081229, + "grad_norm": 10.807418823242188, + "learning_rate": 4.4479796653128515e-06, + "loss": 2.5838, + "step": 8792000 + }, + { + "epoch": 2.733276652361716, + "grad_norm": 16.681406021118164, + "learning_rate": 4.445389127304738e-06, + "loss": 2.5799, + "step": 8792500 + }, + { + "epoch": 2.7334320846422027, + "grad_norm": 10.91028118133545, + "learning_rate": 4.442798589296622e-06, + "loss": 2.6165, + "step": 8793000 + }, + { + "epoch": 2.7335875169226895, + "grad_norm": 10.082151412963867, + "learning_rate": 4.440208051288508e-06, + "loss": 2.549, + "step": 8793500 + }, + { + "epoch": 2.7337429492031764, + "grad_norm": 11.272806167602539, + "learning_rate": 4.437617513280393e-06, + "loss": 2.541, + "step": 8794000 + }, + { + "epoch": 2.7338983814836633, + "grad_norm": 10.446571350097656, + "learning_rate": 4.435026975272279e-06, + "loss": 2.6034, + "step": 8794500 + }, + { + "epoch": 2.73405381376415, + "grad_norm": 11.59419059753418, + "learning_rate": 4.432436437264164e-06, + "loss": 2.5735, + "step": 8795000 + }, + { + "epoch": 2.734209246044637, + "grad_norm": 8.944210052490234, + "learning_rate": 4.42984589925605e-06, + "loss": 2.5686, + "step": 8795500 + }, + { + "epoch": 2.734364678325124, + "grad_norm": 8.802640914916992, + "learning_rate": 4.427255361247934e-06, + "loss": 2.5757, + "step": 8796000 + }, + { + "epoch": 2.7345201106056107, + "grad_norm": 8.98735523223877, + "learning_rate": 4.424664823239821e-06, + "loss": 2.6025, + "step": 8796500 + }, + { + "epoch": 2.7346755428860976, + "grad_norm": 9.641955375671387, + "learning_rate": 4.422074285231706e-06, + "loss": 2.5704, + "step": 8797000 + }, + { + "epoch": 2.7348309751665845, + "grad_norm": 10.3732328414917, + "learning_rate": 4.4194837472235915e-06, + "loss": 2.5931, + "step": 8797500 + }, + { + "epoch": 2.7349864074470713, + "grad_norm": 7.855300426483154, + "learning_rate": 4.416893209215477e-06, + "loss": 2.6096, + "step": 8798000 + }, + { + "epoch": 2.735141839727558, + "grad_norm": 13.449190139770508, + "learning_rate": 4.414302671207362e-06, + "loss": 2.5982, + "step": 8798500 + }, + { + "epoch": 2.735297272008045, + "grad_norm": 9.503920555114746, + "learning_rate": 4.411712133199247e-06, + "loss": 2.5531, + "step": 8799000 + }, + { + "epoch": 2.735452704288532, + "grad_norm": 9.729272842407227, + "learning_rate": 4.4091215951911325e-06, + "loss": 2.5825, + "step": 8799500 + }, + { + "epoch": 2.735608136569019, + "grad_norm": 6.861602306365967, + "learning_rate": 4.406531057183018e-06, + "loss": 2.5434, + "step": 8800000 + }, + { + "epoch": 2.7357635688495057, + "grad_norm": 8.106501579284668, + "learning_rate": 4.403940519174903e-06, + "loss": 2.5766, + "step": 8800500 + }, + { + "epoch": 2.7359190011299925, + "grad_norm": 10.559459686279297, + "learning_rate": 4.401349981166789e-06, + "loss": 2.5323, + "step": 8801000 + }, + { + "epoch": 2.7360744334104794, + "grad_norm": 10.628509521484375, + "learning_rate": 4.398759443158674e-06, + "loss": 2.5388, + "step": 8801500 + }, + { + "epoch": 2.7362298656909663, + "grad_norm": 12.459831237792969, + "learning_rate": 4.39616890515056e-06, + "loss": 2.5678, + "step": 8802000 + }, + { + "epoch": 2.736385297971453, + "grad_norm": 9.517107963562012, + "learning_rate": 4.393578367142445e-06, + "loss": 2.5915, + "step": 8802500 + }, + { + "epoch": 2.7365407302519404, + "grad_norm": 9.312159538269043, + "learning_rate": 4.390987829134331e-06, + "loss": 2.5712, + "step": 8803000 + }, + { + "epoch": 2.736696162532427, + "grad_norm": 16.083105087280273, + "learning_rate": 4.388397291126216e-06, + "loss": 2.5701, + "step": 8803500 + }, + { + "epoch": 2.736851594812914, + "grad_norm": 9.35362434387207, + "learning_rate": 4.385806753118101e-06, + "loss": 2.5152, + "step": 8804000 + }, + { + "epoch": 2.7370070270934006, + "grad_norm": 9.247461318969727, + "learning_rate": 4.383216215109986e-06, + "loss": 2.5317, + "step": 8804500 + }, + { + "epoch": 2.737162459373888, + "grad_norm": 10.309798240661621, + "learning_rate": 4.380625677101872e-06, + "loss": 2.605, + "step": 8805000 + }, + { + "epoch": 2.7373178916543743, + "grad_norm": 9.883081436157227, + "learning_rate": 4.378035139093758e-06, + "loss": 2.5917, + "step": 8805500 + }, + { + "epoch": 2.7374733239348616, + "grad_norm": 12.914803504943848, + "learning_rate": 4.3754446010856435e-06, + "loss": 2.6127, + "step": 8806000 + }, + { + "epoch": 2.737628756215348, + "grad_norm": 15.271327018737793, + "learning_rate": 4.372854063077528e-06, + "loss": 2.6124, + "step": 8806500 + }, + { + "epoch": 2.7377841884958354, + "grad_norm": 10.740762710571289, + "learning_rate": 4.3702635250694135e-06, + "loss": 2.5593, + "step": 8807000 + }, + { + "epoch": 2.737939620776322, + "grad_norm": 9.45228099822998, + "learning_rate": 4.367672987061299e-06, + "loss": 2.5922, + "step": 8807500 + }, + { + "epoch": 2.738095053056809, + "grad_norm": 12.159478187561035, + "learning_rate": 4.3650824490531845e-06, + "loss": 2.5641, + "step": 8808000 + }, + { + "epoch": 2.738250485337296, + "grad_norm": 9.539369583129883, + "learning_rate": 4.36249191104507e-06, + "loss": 2.5962, + "step": 8808500 + }, + { + "epoch": 2.738405917617783, + "grad_norm": 12.61219310760498, + "learning_rate": 4.359901373036955e-06, + "loss": 2.5335, + "step": 8809000 + }, + { + "epoch": 2.7385613498982697, + "grad_norm": 9.390059471130371, + "learning_rate": 4.35731083502884e-06, + "loss": 2.5725, + "step": 8809500 + }, + { + "epoch": 2.7387167821787566, + "grad_norm": 10.179292678833008, + "learning_rate": 4.354720297020726e-06, + "loss": 2.5352, + "step": 8810000 + }, + { + "epoch": 2.7388722144592434, + "grad_norm": 15.617801666259766, + "learning_rate": 4.352129759012612e-06, + "loss": 2.5473, + "step": 8810500 + }, + { + "epoch": 2.7390276467397303, + "grad_norm": 16.771976470947266, + "learning_rate": 4.349539221004497e-06, + "loss": 2.5578, + "step": 8811000 + }, + { + "epoch": 2.739183079020217, + "grad_norm": 29.686237335205078, + "learning_rate": 4.346948682996383e-06, + "loss": 2.5865, + "step": 8811500 + }, + { + "epoch": 2.739338511300704, + "grad_norm": 19.03600311279297, + "learning_rate": 4.344358144988267e-06, + "loss": 2.5438, + "step": 8812000 + }, + { + "epoch": 2.739493943581191, + "grad_norm": 9.21534252166748, + "learning_rate": 4.341767606980153e-06, + "loss": 2.5492, + "step": 8812500 + }, + { + "epoch": 2.739649375861678, + "grad_norm": 28.687665939331055, + "learning_rate": 4.339177068972038e-06, + "loss": 2.6412, + "step": 8813000 + }, + { + "epoch": 2.7398048081421646, + "grad_norm": 40.52609634399414, + "learning_rate": 4.336586530963924e-06, + "loss": 2.5836, + "step": 8813500 + }, + { + "epoch": 2.7399602404226515, + "grad_norm": 9.628609657287598, + "learning_rate": 4.333995992955809e-06, + "loss": 2.5218, + "step": 8814000 + }, + { + "epoch": 2.7401156727031384, + "grad_norm": 8.131553649902344, + "learning_rate": 4.331405454947695e-06, + "loss": 2.5593, + "step": 8814500 + }, + { + "epoch": 2.7402711049836252, + "grad_norm": 34.94347381591797, + "learning_rate": 4.32881491693958e-06, + "loss": 2.5566, + "step": 8815000 + }, + { + "epoch": 2.740426537264112, + "grad_norm": 35.77507781982422, + "learning_rate": 4.3262243789314655e-06, + "loss": 2.5646, + "step": 8815500 + }, + { + "epoch": 2.740581969544599, + "grad_norm": 10.34979248046875, + "learning_rate": 4.323633840923351e-06, + "loss": 2.5988, + "step": 8816000 + }, + { + "epoch": 2.740737401825086, + "grad_norm": 14.950200080871582, + "learning_rate": 4.321043302915236e-06, + "loss": 2.5605, + "step": 8816500 + }, + { + "epoch": 2.7408928341055727, + "grad_norm": 14.185059547424316, + "learning_rate": 4.318452764907122e-06, + "loss": 2.6169, + "step": 8817000 + }, + { + "epoch": 2.7410482663860596, + "grad_norm": 11.161316871643066, + "learning_rate": 4.3158622268990065e-06, + "loss": 2.5876, + "step": 8817500 + }, + { + "epoch": 2.7412036986665465, + "grad_norm": 12.437235832214355, + "learning_rate": 4.313271688890892e-06, + "loss": 2.6501, + "step": 8818000 + }, + { + "epoch": 2.7413591309470333, + "grad_norm": 53.1884765625, + "learning_rate": 4.310681150882777e-06, + "loss": 2.557, + "step": 8818500 + }, + { + "epoch": 2.74151456322752, + "grad_norm": 9.539620399475098, + "learning_rate": 4.308090612874664e-06, + "loss": 2.546, + "step": 8819000 + }, + { + "epoch": 2.741669995508007, + "grad_norm": 12.146312713623047, + "learning_rate": 4.305500074866549e-06, + "loss": 2.5643, + "step": 8819500 + }, + { + "epoch": 2.741825427788494, + "grad_norm": 8.67601203918457, + "learning_rate": 4.302909536858434e-06, + "loss": 2.5305, + "step": 8820000 + }, + { + "epoch": 2.741980860068981, + "grad_norm": 9.350882530212402, + "learning_rate": 4.300318998850319e-06, + "loss": 2.564, + "step": 8820500 + }, + { + "epoch": 2.7421362923494677, + "grad_norm": 13.806668281555176, + "learning_rate": 4.297728460842205e-06, + "loss": 2.5604, + "step": 8821000 + }, + { + "epoch": 2.7422917246299545, + "grad_norm": 7.720210552215576, + "learning_rate": 4.29513792283409e-06, + "loss": 2.5607, + "step": 8821500 + }, + { + "epoch": 2.7424471569104414, + "grad_norm": 8.651549339294434, + "learning_rate": 4.292547384825976e-06, + "loss": 2.5678, + "step": 8822000 + }, + { + "epoch": 2.7426025891909283, + "grad_norm": 14.40985107421875, + "learning_rate": 4.289956846817861e-06, + "loss": 2.567, + "step": 8822500 + }, + { + "epoch": 2.742758021471415, + "grad_norm": 17.725868225097656, + "learning_rate": 4.2873663088097465e-06, + "loss": 2.6355, + "step": 8823000 + }, + { + "epoch": 2.742913453751902, + "grad_norm": 10.709851264953613, + "learning_rate": 4.284775770801632e-06, + "loss": 2.5701, + "step": 8823500 + }, + { + "epoch": 2.743068886032389, + "grad_norm": 8.435876846313477, + "learning_rate": 4.2821852327935175e-06, + "loss": 2.5757, + "step": 8824000 + }, + { + "epoch": 2.7432243183128757, + "grad_norm": 9.795756340026855, + "learning_rate": 4.279594694785403e-06, + "loss": 2.5942, + "step": 8824500 + }, + { + "epoch": 2.7433797505933626, + "grad_norm": 6.586709022521973, + "learning_rate": 4.277004156777288e-06, + "loss": 2.5321, + "step": 8825000 + }, + { + "epoch": 2.7435351828738495, + "grad_norm": 45.10361862182617, + "learning_rate": 4.274413618769174e-06, + "loss": 2.556, + "step": 8825500 + }, + { + "epoch": 2.7436906151543363, + "grad_norm": 10.615703582763672, + "learning_rate": 4.2718230807610585e-06, + "loss": 2.5973, + "step": 8826000 + }, + { + "epoch": 2.7438460474348236, + "grad_norm": 8.409485816955566, + "learning_rate": 4.269232542752944e-06, + "loss": 2.5604, + "step": 8826500 + }, + { + "epoch": 2.74400147971531, + "grad_norm": 10.373915672302246, + "learning_rate": 4.266642004744829e-06, + "loss": 2.5974, + "step": 8827000 + }, + { + "epoch": 2.7441569119957974, + "grad_norm": 12.134735107421875, + "learning_rate": 4.264051466736715e-06, + "loss": 2.5391, + "step": 8827500 + }, + { + "epoch": 2.744312344276284, + "grad_norm": 13.452338218688965, + "learning_rate": 4.261460928728601e-06, + "loss": 2.5819, + "step": 8828000 + }, + { + "epoch": 2.744467776556771, + "grad_norm": 11.926783561706543, + "learning_rate": 4.258870390720486e-06, + "loss": 2.6015, + "step": 8828500 + }, + { + "epoch": 2.7446232088372575, + "grad_norm": 11.911081314086914, + "learning_rate": 4.256279852712371e-06, + "loss": 2.6124, + "step": 8829000 + }, + { + "epoch": 2.744778641117745, + "grad_norm": 10.020391464233398, + "learning_rate": 4.253689314704257e-06, + "loss": 2.5508, + "step": 8829500 + }, + { + "epoch": 2.7449340733982313, + "grad_norm": 10.199869155883789, + "learning_rate": 4.251098776696142e-06, + "loss": 2.5555, + "step": 8830000 + }, + { + "epoch": 2.7450895056787186, + "grad_norm": 10.226454734802246, + "learning_rate": 4.248508238688028e-06, + "loss": 2.5625, + "step": 8830500 + }, + { + "epoch": 2.745244937959205, + "grad_norm": 8.647929191589355, + "learning_rate": 4.245917700679913e-06, + "loss": 2.5747, + "step": 8831000 + }, + { + "epoch": 2.7454003702396923, + "grad_norm": 11.244651794433594, + "learning_rate": 4.243327162671798e-06, + "loss": 2.5627, + "step": 8831500 + }, + { + "epoch": 2.7455558025201787, + "grad_norm": 10.844593048095703, + "learning_rate": 4.240736624663683e-06, + "loss": 2.52, + "step": 8832000 + }, + { + "epoch": 2.745711234800666, + "grad_norm": 8.387639045715332, + "learning_rate": 4.2381460866555694e-06, + "loss": 2.5393, + "step": 8832500 + }, + { + "epoch": 2.745866667081153, + "grad_norm": 20.93907928466797, + "learning_rate": 4.235555548647455e-06, + "loss": 2.6062, + "step": 8833000 + }, + { + "epoch": 2.7460220993616398, + "grad_norm": 11.76702880859375, + "learning_rate": 4.23296501063934e-06, + "loss": 2.5343, + "step": 8833500 + }, + { + "epoch": 2.7461775316421266, + "grad_norm": 14.410260200500488, + "learning_rate": 4.230374472631225e-06, + "loss": 2.6041, + "step": 8834000 + }, + { + "epoch": 2.7463329639226135, + "grad_norm": 11.306879997253418, + "learning_rate": 4.22778393462311e-06, + "loss": 2.5578, + "step": 8834500 + }, + { + "epoch": 2.7464883962031004, + "grad_norm": 10.845063209533691, + "learning_rate": 4.225193396614996e-06, + "loss": 2.5789, + "step": 8835000 + }, + { + "epoch": 2.7466438284835872, + "grad_norm": 7.653181076049805, + "learning_rate": 4.222602858606881e-06, + "loss": 2.5797, + "step": 8835500 + }, + { + "epoch": 2.746799260764074, + "grad_norm": 34.15044021606445, + "learning_rate": 4.220012320598767e-06, + "loss": 2.5657, + "step": 8836000 + }, + { + "epoch": 2.746954693044561, + "grad_norm": 11.256248474121094, + "learning_rate": 4.217421782590652e-06, + "loss": 2.5584, + "step": 8836500 + }, + { + "epoch": 2.747110125325048, + "grad_norm": 10.945734024047852, + "learning_rate": 4.214831244582538e-06, + "loss": 2.5473, + "step": 8837000 + }, + { + "epoch": 2.7472655576055347, + "grad_norm": 9.573127746582031, + "learning_rate": 4.212240706574423e-06, + "loss": 2.6058, + "step": 8837500 + }, + { + "epoch": 2.7474209898860216, + "grad_norm": 11.769042015075684, + "learning_rate": 4.209650168566309e-06, + "loss": 2.5839, + "step": 8838000 + }, + { + "epoch": 2.7475764221665084, + "grad_norm": 11.222372055053711, + "learning_rate": 4.207059630558194e-06, + "loss": 2.5577, + "step": 8838500 + }, + { + "epoch": 2.7477318544469953, + "grad_norm": 9.392078399658203, + "learning_rate": 4.2044690925500796e-06, + "loss": 2.6019, + "step": 8839000 + }, + { + "epoch": 2.747887286727482, + "grad_norm": 13.352828979492188, + "learning_rate": 4.201878554541964e-06, + "loss": 2.6178, + "step": 8839500 + }, + { + "epoch": 2.748042719007969, + "grad_norm": 12.110712051391602, + "learning_rate": 4.19928801653385e-06, + "loss": 2.577, + "step": 8840000 + }, + { + "epoch": 2.748198151288456, + "grad_norm": 11.270709037780762, + "learning_rate": 4.196697478525735e-06, + "loss": 2.5901, + "step": 8840500 + }, + { + "epoch": 2.7483535835689428, + "grad_norm": 10.11793041229248, + "learning_rate": 4.1941069405176205e-06, + "loss": 2.5262, + "step": 8841000 + }, + { + "epoch": 2.7485090158494296, + "grad_norm": 12.793214797973633, + "learning_rate": 4.191516402509507e-06, + "loss": 2.5965, + "step": 8841500 + }, + { + "epoch": 2.7486644481299165, + "grad_norm": 13.220940589904785, + "learning_rate": 4.1889258645013915e-06, + "loss": 2.6313, + "step": 8842000 + }, + { + "epoch": 2.7488198804104034, + "grad_norm": 8.922088623046875, + "learning_rate": 4.186335326493277e-06, + "loss": 2.5787, + "step": 8842500 + }, + { + "epoch": 2.7489753126908902, + "grad_norm": 17.627962112426758, + "learning_rate": 4.183744788485162e-06, + "loss": 2.6032, + "step": 8843000 + }, + { + "epoch": 2.749130744971377, + "grad_norm": 8.915068626403809, + "learning_rate": 4.181154250477048e-06, + "loss": 2.5374, + "step": 8843500 + }, + { + "epoch": 2.749286177251864, + "grad_norm": 10.841893196105957, + "learning_rate": 4.178563712468933e-06, + "loss": 2.5902, + "step": 8844000 + }, + { + "epoch": 2.749441609532351, + "grad_norm": 11.282668113708496, + "learning_rate": 4.175973174460819e-06, + "loss": 2.5977, + "step": 8844500 + }, + { + "epoch": 2.7495970418128377, + "grad_norm": 12.836417198181152, + "learning_rate": 4.173382636452703e-06, + "loss": 2.5795, + "step": 8845000 + }, + { + "epoch": 2.7497524740933246, + "grad_norm": 21.77410316467285, + "learning_rate": 4.170792098444589e-06, + "loss": 2.6088, + "step": 8845500 + }, + { + "epoch": 2.7499079063738114, + "grad_norm": 12.041716575622559, + "learning_rate": 4.168201560436475e-06, + "loss": 2.5956, + "step": 8846000 + }, + { + "epoch": 2.7500633386542983, + "grad_norm": 11.096090316772461, + "learning_rate": 4.165611022428361e-06, + "loss": 2.5812, + "step": 8846500 + }, + { + "epoch": 2.750218770934785, + "grad_norm": 9.74467658996582, + "learning_rate": 4.163020484420246e-06, + "loss": 2.5206, + "step": 8847000 + }, + { + "epoch": 2.750374203215272, + "grad_norm": 8.834026336669922, + "learning_rate": 4.160429946412131e-06, + "loss": 2.5577, + "step": 8847500 + }, + { + "epoch": 2.750529635495759, + "grad_norm": 9.199724197387695, + "learning_rate": 4.157839408404016e-06, + "loss": 2.5887, + "step": 8848000 + }, + { + "epoch": 2.7506850677762458, + "grad_norm": 11.093640327453613, + "learning_rate": 4.155248870395902e-06, + "loss": 2.56, + "step": 8848500 + }, + { + "epoch": 2.7508405000567326, + "grad_norm": 10.51749038696289, + "learning_rate": 4.152658332387787e-06, + "loss": 2.5541, + "step": 8849000 + }, + { + "epoch": 2.7509959323372195, + "grad_norm": 15.352462768554688, + "learning_rate": 4.1500677943796725e-06, + "loss": 2.6551, + "step": 8849500 + }, + { + "epoch": 2.7511513646177064, + "grad_norm": 10.541808128356934, + "learning_rate": 4.147477256371558e-06, + "loss": 2.5756, + "step": 8850000 + }, + { + "epoch": 2.7513067968981932, + "grad_norm": 9.896324157714844, + "learning_rate": 4.144886718363443e-06, + "loss": 2.5426, + "step": 8850500 + }, + { + "epoch": 2.7514622291786806, + "grad_norm": 7.483187198638916, + "learning_rate": 4.142296180355329e-06, + "loss": 2.5542, + "step": 8851000 + }, + { + "epoch": 2.751617661459167, + "grad_norm": 8.69493579864502, + "learning_rate": 4.139705642347214e-06, + "loss": 2.6189, + "step": 8851500 + }, + { + "epoch": 2.7517730937396543, + "grad_norm": 13.156938552856445, + "learning_rate": 4.1371151043391e-06, + "loss": 2.58, + "step": 8852000 + }, + { + "epoch": 2.7519285260201407, + "grad_norm": 10.31033992767334, + "learning_rate": 4.134524566330985e-06, + "loss": 2.5466, + "step": 8852500 + }, + { + "epoch": 2.752083958300628, + "grad_norm": 10.257204055786133, + "learning_rate": 4.13193402832287e-06, + "loss": 2.5656, + "step": 8853000 + }, + { + "epoch": 2.7522393905811144, + "grad_norm": 9.94948673248291, + "learning_rate": 4.129343490314755e-06, + "loss": 2.5821, + "step": 8853500 + }, + { + "epoch": 2.7523948228616018, + "grad_norm": 10.09221363067627, + "learning_rate": 4.126752952306641e-06, + "loss": 2.5369, + "step": 8854000 + }, + { + "epoch": 2.752550255142088, + "grad_norm": 9.978741645812988, + "learning_rate": 4.124162414298526e-06, + "loss": 2.5382, + "step": 8854500 + }, + { + "epoch": 2.7527056874225755, + "grad_norm": 9.7691011428833, + "learning_rate": 4.1215718762904126e-06, + "loss": 2.5636, + "step": 8855000 + }, + { + "epoch": 2.752861119703062, + "grad_norm": 42.34390640258789, + "learning_rate": 4.118981338282297e-06, + "loss": 2.5672, + "step": 8855500 + }, + { + "epoch": 2.753016551983549, + "grad_norm": 10.16849422454834, + "learning_rate": 4.116390800274183e-06, + "loss": 2.5618, + "step": 8856000 + }, + { + "epoch": 2.753171984264036, + "grad_norm": 6.331061840057373, + "learning_rate": 4.113800262266068e-06, + "loss": 2.5699, + "step": 8856500 + }, + { + "epoch": 2.753327416544523, + "grad_norm": 11.944804191589355, + "learning_rate": 4.1112097242579535e-06, + "loss": 2.6153, + "step": 8857000 + }, + { + "epoch": 2.75348284882501, + "grad_norm": 8.643664360046387, + "learning_rate": 4.108619186249839e-06, + "loss": 2.6095, + "step": 8857500 + }, + { + "epoch": 2.7536382811054967, + "grad_norm": 10.409786224365234, + "learning_rate": 4.1060286482417245e-06, + "loss": 2.5386, + "step": 8858000 + }, + { + "epoch": 2.7537937133859836, + "grad_norm": 9.590499877929688, + "learning_rate": 4.103438110233609e-06, + "loss": 2.5731, + "step": 8858500 + }, + { + "epoch": 2.7539491456664704, + "grad_norm": 6.885200023651123, + "learning_rate": 4.1008475722254945e-06, + "loss": 2.6038, + "step": 8859000 + }, + { + "epoch": 2.7541045779469573, + "grad_norm": 9.213530540466309, + "learning_rate": 4.098257034217381e-06, + "loss": 2.581, + "step": 8859500 + }, + { + "epoch": 2.754260010227444, + "grad_norm": 12.002130508422852, + "learning_rate": 4.095666496209266e-06, + "loss": 2.5284, + "step": 8860000 + }, + { + "epoch": 2.754415442507931, + "grad_norm": 9.321035385131836, + "learning_rate": 4.093075958201152e-06, + "loss": 2.5784, + "step": 8860500 + }, + { + "epoch": 2.754570874788418, + "grad_norm": 9.822332382202148, + "learning_rate": 4.090485420193036e-06, + "loss": 2.5536, + "step": 8861000 + }, + { + "epoch": 2.7547263070689048, + "grad_norm": 9.314834594726562, + "learning_rate": 4.087894882184922e-06, + "loss": 2.6194, + "step": 8861500 + }, + { + "epoch": 2.7548817393493916, + "grad_norm": 10.118818283081055, + "learning_rate": 4.085304344176807e-06, + "loss": 2.587, + "step": 8862000 + }, + { + "epoch": 2.7550371716298785, + "grad_norm": 8.517928123474121, + "learning_rate": 4.082713806168693e-06, + "loss": 2.6201, + "step": 8862500 + }, + { + "epoch": 2.7551926039103654, + "grad_norm": 10.551444053649902, + "learning_rate": 4.080123268160578e-06, + "loss": 2.5552, + "step": 8863000 + }, + { + "epoch": 2.7553480361908522, + "grad_norm": 19.7164306640625, + "learning_rate": 4.077532730152464e-06, + "loss": 2.5471, + "step": 8863500 + }, + { + "epoch": 2.755503468471339, + "grad_norm": 20.894140243530273, + "learning_rate": 4.074942192144349e-06, + "loss": 2.54, + "step": 8864000 + }, + { + "epoch": 2.755658900751826, + "grad_norm": 10.35657787322998, + "learning_rate": 4.072351654136235e-06, + "loss": 2.569, + "step": 8864500 + }, + { + "epoch": 2.755814333032313, + "grad_norm": 13.636388778686523, + "learning_rate": 4.06976111612812e-06, + "loss": 2.5422, + "step": 8865000 + }, + { + "epoch": 2.7559697653127997, + "grad_norm": 8.246617317199707, + "learning_rate": 4.0671705781200055e-06, + "loss": 2.5814, + "step": 8865500 + }, + { + "epoch": 2.7561251975932866, + "grad_norm": 8.982048034667969, + "learning_rate": 4.064580040111891e-06, + "loss": 2.5776, + "step": 8866000 + }, + { + "epoch": 2.7562806298737734, + "grad_norm": 10.357933044433594, + "learning_rate": 4.061989502103776e-06, + "loss": 2.573, + "step": 8866500 + }, + { + "epoch": 2.7564360621542603, + "grad_norm": 9.173969268798828, + "learning_rate": 4.059398964095661e-06, + "loss": 2.5879, + "step": 8867000 + }, + { + "epoch": 2.756591494434747, + "grad_norm": 9.324481010437012, + "learning_rate": 4.0568084260875465e-06, + "loss": 2.6037, + "step": 8867500 + }, + { + "epoch": 2.756746926715234, + "grad_norm": 34.77262496948242, + "learning_rate": 4.054217888079433e-06, + "loss": 2.5357, + "step": 8868000 + }, + { + "epoch": 2.756902358995721, + "grad_norm": 10.413207054138184, + "learning_rate": 4.051627350071318e-06, + "loss": 2.5671, + "step": 8868500 + }, + { + "epoch": 2.7570577912762078, + "grad_norm": 8.775330543518066, + "learning_rate": 4.049036812063203e-06, + "loss": 2.5659, + "step": 8869000 + }, + { + "epoch": 2.7572132235566946, + "grad_norm": 9.398386001586914, + "learning_rate": 4.046446274055088e-06, + "loss": 2.5723, + "step": 8869500 + }, + { + "epoch": 2.7573686558371815, + "grad_norm": 8.636253356933594, + "learning_rate": 4.043855736046974e-06, + "loss": 2.586, + "step": 8870000 + }, + { + "epoch": 2.7575240881176684, + "grad_norm": 9.163052558898926, + "learning_rate": 4.041265198038859e-06, + "loss": 2.5661, + "step": 8870500 + }, + { + "epoch": 2.7576795203981552, + "grad_norm": 10.058584213256836, + "learning_rate": 4.038674660030745e-06, + "loss": 2.5905, + "step": 8871000 + }, + { + "epoch": 2.757834952678642, + "grad_norm": 8.856649398803711, + "learning_rate": 4.03608412202263e-06, + "loss": 2.5396, + "step": 8871500 + }, + { + "epoch": 2.757990384959129, + "grad_norm": 9.5145845413208, + "learning_rate": 4.033493584014516e-06, + "loss": 2.5129, + "step": 8872000 + }, + { + "epoch": 2.758145817239616, + "grad_norm": 9.02550983428955, + "learning_rate": 4.030903046006401e-06, + "loss": 2.5828, + "step": 8872500 + }, + { + "epoch": 2.7583012495201027, + "grad_norm": 11.59213924407959, + "learning_rate": 4.0283125079982865e-06, + "loss": 2.5748, + "step": 8873000 + }, + { + "epoch": 2.7584566818005896, + "grad_norm": 22.17816925048828, + "learning_rate": 4.025721969990172e-06, + "loss": 2.5997, + "step": 8873500 + }, + { + "epoch": 2.7586121140810764, + "grad_norm": 41.427696228027344, + "learning_rate": 4.0231314319820575e-06, + "loss": 2.618, + "step": 8874000 + }, + { + "epoch": 2.7587675463615637, + "grad_norm": 11.927318572998047, + "learning_rate": 4.020540893973943e-06, + "loss": 2.5664, + "step": 8874500 + }, + { + "epoch": 2.75892297864205, + "grad_norm": 9.025847434997559, + "learning_rate": 4.0179503559658275e-06, + "loss": 2.599, + "step": 8875000 + }, + { + "epoch": 2.7590784109225375, + "grad_norm": 35.68826675415039, + "learning_rate": 4.015359817957713e-06, + "loss": 2.5903, + "step": 8875500 + }, + { + "epoch": 2.759233843203024, + "grad_norm": 8.581951141357422, + "learning_rate": 4.0127692799495985e-06, + "loss": 2.57, + "step": 8876000 + }, + { + "epoch": 2.759389275483511, + "grad_norm": 15.827627182006836, + "learning_rate": 4.010178741941484e-06, + "loss": 2.5724, + "step": 8876500 + }, + { + "epoch": 2.7595447077639976, + "grad_norm": 9.017580032348633, + "learning_rate": 4.00758820393337e-06, + "loss": 2.5431, + "step": 8877000 + }, + { + "epoch": 2.759700140044485, + "grad_norm": 11.927164077758789, + "learning_rate": 4.004997665925255e-06, + "loss": 2.5555, + "step": 8877500 + }, + { + "epoch": 2.7598555723249714, + "grad_norm": 10.11070442199707, + "learning_rate": 4.00240712791714e-06, + "loss": 2.5586, + "step": 8878000 + }, + { + "epoch": 2.7600110046054587, + "grad_norm": 9.106185913085938, + "learning_rate": 3.999816589909026e-06, + "loss": 2.6103, + "step": 8878500 + }, + { + "epoch": 2.760166436885945, + "grad_norm": 12.807356834411621, + "learning_rate": 3.997226051900911e-06, + "loss": 2.5481, + "step": 8879000 + }, + { + "epoch": 2.7603218691664324, + "grad_norm": 9.489652633666992, + "learning_rate": 3.994635513892797e-06, + "loss": 2.5467, + "step": 8879500 + }, + { + "epoch": 2.760477301446919, + "grad_norm": 32.379329681396484, + "learning_rate": 3.992044975884682e-06, + "loss": 2.5428, + "step": 8880000 + }, + { + "epoch": 2.760632733727406, + "grad_norm": 14.461333274841309, + "learning_rate": 3.989454437876567e-06, + "loss": 2.5773, + "step": 8880500 + }, + { + "epoch": 2.760788166007893, + "grad_norm": 28.297588348388672, + "learning_rate": 3.986863899868452e-06, + "loss": 2.5607, + "step": 8881000 + }, + { + "epoch": 2.76094359828838, + "grad_norm": 10.920271873474121, + "learning_rate": 3.9842733618603385e-06, + "loss": 2.5591, + "step": 8881500 + }, + { + "epoch": 2.7610990305688667, + "grad_norm": 8.456538200378418, + "learning_rate": 3.981682823852224e-06, + "loss": 2.5307, + "step": 8882000 + }, + { + "epoch": 2.7612544628493536, + "grad_norm": 8.994375228881836, + "learning_rate": 3.9790922858441094e-06, + "loss": 2.5477, + "step": 8882500 + }, + { + "epoch": 2.7614098951298405, + "grad_norm": 11.844246864318848, + "learning_rate": 3.976501747835994e-06, + "loss": 2.5705, + "step": 8883000 + }, + { + "epoch": 2.7615653274103273, + "grad_norm": 13.92089557647705, + "learning_rate": 3.9739112098278795e-06, + "loss": 2.5828, + "step": 8883500 + }, + { + "epoch": 2.761720759690814, + "grad_norm": 37.53730392456055, + "learning_rate": 3.971320671819765e-06, + "loss": 2.5853, + "step": 8884000 + }, + { + "epoch": 2.761876191971301, + "grad_norm": 14.7269868850708, + "learning_rate": 3.96873013381165e-06, + "loss": 2.5749, + "step": 8884500 + }, + { + "epoch": 2.762031624251788, + "grad_norm": 10.049737930297852, + "learning_rate": 3.966139595803536e-06, + "loss": 2.6015, + "step": 8885000 + }, + { + "epoch": 2.762187056532275, + "grad_norm": 13.16279125213623, + "learning_rate": 3.963549057795421e-06, + "loss": 2.5285, + "step": 8885500 + }, + { + "epoch": 2.7623424888127617, + "grad_norm": 9.933688163757324, + "learning_rate": 3.960958519787307e-06, + "loss": 2.5776, + "step": 8886000 + }, + { + "epoch": 2.7624979210932485, + "grad_norm": 13.107501983642578, + "learning_rate": 3.958367981779192e-06, + "loss": 2.571, + "step": 8886500 + }, + { + "epoch": 2.7626533533737354, + "grad_norm": 9.069334983825684, + "learning_rate": 3.955777443771078e-06, + "loss": 2.5963, + "step": 8887000 + }, + { + "epoch": 2.7628087856542223, + "grad_norm": 37.98514175415039, + "learning_rate": 3.953186905762963e-06, + "loss": 2.5283, + "step": 8887500 + }, + { + "epoch": 2.762964217934709, + "grad_norm": 11.1737060546875, + "learning_rate": 3.950596367754849e-06, + "loss": 2.5709, + "step": 8888000 + }, + { + "epoch": 2.763119650215196, + "grad_norm": 9.664578437805176, + "learning_rate": 3.948005829746733e-06, + "loss": 2.6121, + "step": 8888500 + }, + { + "epoch": 2.763275082495683, + "grad_norm": 8.994400024414062, + "learning_rate": 3.945415291738619e-06, + "loss": 2.613, + "step": 8889000 + }, + { + "epoch": 2.7634305147761697, + "grad_norm": 14.384316444396973, + "learning_rate": 3.942824753730504e-06, + "loss": 2.5819, + "step": 8889500 + }, + { + "epoch": 2.7635859470566566, + "grad_norm": 10.406821250915527, + "learning_rate": 3.94023421572239e-06, + "loss": 2.5925, + "step": 8890000 + }, + { + "epoch": 2.7637413793371435, + "grad_norm": 9.917696952819824, + "learning_rate": 3.937643677714276e-06, + "loss": 2.6125, + "step": 8890500 + }, + { + "epoch": 2.7638968116176303, + "grad_norm": 8.427663803100586, + "learning_rate": 3.9350531397061605e-06, + "loss": 2.5543, + "step": 8891000 + }, + { + "epoch": 2.764052243898117, + "grad_norm": 10.207192420959473, + "learning_rate": 3.932462601698046e-06, + "loss": 2.626, + "step": 8891500 + }, + { + "epoch": 2.764207676178604, + "grad_norm": 14.685416221618652, + "learning_rate": 3.9298720636899315e-06, + "loss": 2.5797, + "step": 8892000 + }, + { + "epoch": 2.764363108459091, + "grad_norm": 11.209840774536133, + "learning_rate": 3.927281525681817e-06, + "loss": 2.5374, + "step": 8892500 + }, + { + "epoch": 2.764518540739578, + "grad_norm": 13.800183296203613, + "learning_rate": 3.924690987673702e-06, + "loss": 2.5989, + "step": 8893000 + }, + { + "epoch": 2.7646739730200647, + "grad_norm": 8.882505416870117, + "learning_rate": 3.922100449665588e-06, + "loss": 2.5853, + "step": 8893500 + }, + { + "epoch": 2.7648294053005515, + "grad_norm": 11.971192359924316, + "learning_rate": 3.9195099116574725e-06, + "loss": 2.571, + "step": 8894000 + }, + { + "epoch": 2.7649848375810384, + "grad_norm": 11.081512451171875, + "learning_rate": 3.916919373649358e-06, + "loss": 2.5621, + "step": 8894500 + }, + { + "epoch": 2.7651402698615253, + "grad_norm": 9.085956573486328, + "learning_rate": 3.914328835641244e-06, + "loss": 2.5614, + "step": 8895000 + }, + { + "epoch": 2.765295702142012, + "grad_norm": 10.920188903808594, + "learning_rate": 3.91173829763313e-06, + "loss": 2.5497, + "step": 8895500 + }, + { + "epoch": 2.765451134422499, + "grad_norm": 6.667335033416748, + "learning_rate": 3.909147759625015e-06, + "loss": 2.5603, + "step": 8896000 + }, + { + "epoch": 2.765606566702986, + "grad_norm": 10.034854888916016, + "learning_rate": 3.9065572216169e-06, + "loss": 2.542, + "step": 8896500 + }, + { + "epoch": 2.7657619989834727, + "grad_norm": 9.61808967590332, + "learning_rate": 3.903966683608785e-06, + "loss": 2.5307, + "step": 8897000 + }, + { + "epoch": 2.7659174312639596, + "grad_norm": 11.14747142791748, + "learning_rate": 3.901376145600671e-06, + "loss": 2.6111, + "step": 8897500 + }, + { + "epoch": 2.7660728635444465, + "grad_norm": 11.893034934997559, + "learning_rate": 3.898785607592556e-06, + "loss": 2.5612, + "step": 8898000 + }, + { + "epoch": 2.7662282958249333, + "grad_norm": 27.766258239746094, + "learning_rate": 3.896195069584442e-06, + "loss": 2.5475, + "step": 8898500 + }, + { + "epoch": 2.7663837281054207, + "grad_norm": 8.398362159729004, + "learning_rate": 3.893604531576327e-06, + "loss": 2.6034, + "step": 8899000 + }, + { + "epoch": 2.766539160385907, + "grad_norm": 10.550000190734863, + "learning_rate": 3.8910139935682125e-06, + "loss": 2.5182, + "step": 8899500 + }, + { + "epoch": 2.7666945926663944, + "grad_norm": 13.176681518554688, + "learning_rate": 3.888423455560098e-06, + "loss": 2.6156, + "step": 8900000 + }, + { + "epoch": 2.766850024946881, + "grad_norm": 13.063270568847656, + "learning_rate": 3.885832917551983e-06, + "loss": 2.5657, + "step": 8900500 + }, + { + "epoch": 2.767005457227368, + "grad_norm": 92.20024108886719, + "learning_rate": 3.883242379543869e-06, + "loss": 2.5888, + "step": 8901000 + }, + { + "epoch": 2.7671608895078545, + "grad_norm": 9.755678176879883, + "learning_rate": 3.880651841535754e-06, + "loss": 2.5678, + "step": 8901500 + }, + { + "epoch": 2.767316321788342, + "grad_norm": 13.480125427246094, + "learning_rate": 3.878061303527639e-06, + "loss": 2.6151, + "step": 8902000 + }, + { + "epoch": 2.7674717540688283, + "grad_norm": 19.723281860351562, + "learning_rate": 3.875470765519524e-06, + "loss": 2.5819, + "step": 8902500 + }, + { + "epoch": 2.7676271863493156, + "grad_norm": 8.724456787109375, + "learning_rate": 3.87288022751141e-06, + "loss": 2.5349, + "step": 8903000 + }, + { + "epoch": 2.767782618629802, + "grad_norm": 9.484245300292969, + "learning_rate": 3.870289689503295e-06, + "loss": 2.5675, + "step": 8903500 + }, + { + "epoch": 2.7679380509102893, + "grad_norm": 42.2684440612793, + "learning_rate": 3.867699151495182e-06, + "loss": 2.5164, + "step": 8904000 + }, + { + "epoch": 2.7680934831907757, + "grad_norm": 12.068793296813965, + "learning_rate": 3.865108613487066e-06, + "loss": 2.5662, + "step": 8904500 + }, + { + "epoch": 2.768248915471263, + "grad_norm": 10.015169143676758, + "learning_rate": 3.862518075478952e-06, + "loss": 2.572, + "step": 8905000 + }, + { + "epoch": 2.76840434775175, + "grad_norm": 9.782103538513184, + "learning_rate": 3.859927537470837e-06, + "loss": 2.5861, + "step": 8905500 + }, + { + "epoch": 2.768559780032237, + "grad_norm": 10.887751579284668, + "learning_rate": 3.857336999462723e-06, + "loss": 2.5984, + "step": 8906000 + }, + { + "epoch": 2.7687152123127237, + "grad_norm": 7.729807376861572, + "learning_rate": 3.854746461454608e-06, + "loss": 2.546, + "step": 8906500 + }, + { + "epoch": 2.7688706445932105, + "grad_norm": 14.475388526916504, + "learning_rate": 3.8521559234464935e-06, + "loss": 2.5906, + "step": 8907000 + }, + { + "epoch": 2.7690260768736974, + "grad_norm": 10.7151460647583, + "learning_rate": 3.849565385438378e-06, + "loss": 2.632, + "step": 8907500 + }, + { + "epoch": 2.7691815091541843, + "grad_norm": 11.988731384277344, + "learning_rate": 3.846974847430264e-06, + "loss": 2.56, + "step": 8908000 + }, + { + "epoch": 2.769336941434671, + "grad_norm": 11.061319351196289, + "learning_rate": 3.84438430942215e-06, + "loss": 2.5798, + "step": 8908500 + }, + { + "epoch": 2.769492373715158, + "grad_norm": 13.093205451965332, + "learning_rate": 3.841793771414035e-06, + "loss": 2.5253, + "step": 8909000 + }, + { + "epoch": 2.769647805995645, + "grad_norm": 37.14911651611328, + "learning_rate": 3.839203233405921e-06, + "loss": 2.5706, + "step": 8909500 + }, + { + "epoch": 2.7698032382761317, + "grad_norm": 9.217652320861816, + "learning_rate": 3.8366126953978055e-06, + "loss": 2.5634, + "step": 8910000 + }, + { + "epoch": 2.7699586705566186, + "grad_norm": 10.719766616821289, + "learning_rate": 3.834022157389691e-06, + "loss": 2.5374, + "step": 8910500 + }, + { + "epoch": 2.7701141028371055, + "grad_norm": 11.35690975189209, + "learning_rate": 3.831431619381576e-06, + "loss": 2.5648, + "step": 8911000 + }, + { + "epoch": 2.7702695351175923, + "grad_norm": 10.484770774841309, + "learning_rate": 3.828841081373462e-06, + "loss": 2.5673, + "step": 8911500 + }, + { + "epoch": 2.770424967398079, + "grad_norm": 6.846555233001709, + "learning_rate": 3.826250543365347e-06, + "loss": 2.5984, + "step": 8912000 + }, + { + "epoch": 2.770580399678566, + "grad_norm": 7.8123860359191895, + "learning_rate": 3.823660005357233e-06, + "loss": 2.5651, + "step": 8912500 + }, + { + "epoch": 2.770735831959053, + "grad_norm": 10.643349647521973, + "learning_rate": 3.821069467349118e-06, + "loss": 2.5816, + "step": 8913000 + }, + { + "epoch": 2.77089126423954, + "grad_norm": 9.884709358215332, + "learning_rate": 3.818478929341004e-06, + "loss": 2.613, + "step": 8913500 + }, + { + "epoch": 2.7710466965200267, + "grad_norm": 10.753605842590332, + "learning_rate": 3.815888391332889e-06, + "loss": 2.5716, + "step": 8914000 + }, + { + "epoch": 2.7712021288005135, + "grad_norm": 12.175786018371582, + "learning_rate": 3.8132978533247746e-06, + "loss": 2.5806, + "step": 8914500 + }, + { + "epoch": 2.7713575610810004, + "grad_norm": 11.247928619384766, + "learning_rate": 3.8107073153166596e-06, + "loss": 2.5816, + "step": 8915000 + }, + { + "epoch": 2.7715129933614873, + "grad_norm": 11.409966468811035, + "learning_rate": 3.808116777308545e-06, + "loss": 2.5803, + "step": 8915500 + }, + { + "epoch": 2.771668425641974, + "grad_norm": 9.382481575012207, + "learning_rate": 3.8055262393004305e-06, + "loss": 2.5719, + "step": 8916000 + }, + { + "epoch": 2.771823857922461, + "grad_norm": 12.253747940063477, + "learning_rate": 3.8029357012923156e-06, + "loss": 2.564, + "step": 8916500 + }, + { + "epoch": 2.771979290202948, + "grad_norm": 10.12362289428711, + "learning_rate": 3.800345163284201e-06, + "loss": 2.6008, + "step": 8917000 + }, + { + "epoch": 2.7721347224834347, + "grad_norm": 7.859279632568359, + "learning_rate": 3.797754625276087e-06, + "loss": 2.5303, + "step": 8917500 + }, + { + "epoch": 2.7722901547639216, + "grad_norm": 13.775951385498047, + "learning_rate": 3.7951640872679724e-06, + "loss": 2.5719, + "step": 8918000 + }, + { + "epoch": 2.7724455870444085, + "grad_norm": 13.512101173400879, + "learning_rate": 3.792573549259858e-06, + "loss": 2.5688, + "step": 8918500 + }, + { + "epoch": 2.7726010193248953, + "grad_norm": 8.765182495117188, + "learning_rate": 3.789983011251743e-06, + "loss": 2.615, + "step": 8919000 + }, + { + "epoch": 2.772756451605382, + "grad_norm": 28.47557830810547, + "learning_rate": 3.7873924732436283e-06, + "loss": 2.5778, + "step": 8919500 + }, + { + "epoch": 2.772911883885869, + "grad_norm": 9.011969566345215, + "learning_rate": 3.784801935235514e-06, + "loss": 2.6108, + "step": 8920000 + }, + { + "epoch": 2.773067316166356, + "grad_norm": 8.488121032714844, + "learning_rate": 3.782211397227399e-06, + "loss": 2.5874, + "step": 8920500 + }, + { + "epoch": 2.773222748446843, + "grad_norm": 11.693982124328613, + "learning_rate": 3.7796208592192843e-06, + "loss": 2.5708, + "step": 8921000 + }, + { + "epoch": 2.7733781807273297, + "grad_norm": 11.45771312713623, + "learning_rate": 3.7770303212111697e-06, + "loss": 2.5476, + "step": 8921500 + }, + { + "epoch": 2.7735336130078165, + "grad_norm": 11.592402458190918, + "learning_rate": 3.7744397832030556e-06, + "loss": 2.5658, + "step": 8922000 + }, + { + "epoch": 2.7736890452883034, + "grad_norm": 9.475942611694336, + "learning_rate": 3.771849245194941e-06, + "loss": 2.5842, + "step": 8922500 + }, + { + "epoch": 2.7738444775687903, + "grad_norm": 10.158472061157227, + "learning_rate": 3.769258707186826e-06, + "loss": 2.548, + "step": 8923000 + }, + { + "epoch": 2.7739999098492776, + "grad_norm": 13.475296974182129, + "learning_rate": 3.7666681691787116e-06, + "loss": 2.5754, + "step": 8923500 + }, + { + "epoch": 2.774155342129764, + "grad_norm": 11.139310836791992, + "learning_rate": 3.764077631170597e-06, + "loss": 2.5712, + "step": 8924000 + }, + { + "epoch": 2.7743107744102513, + "grad_norm": 11.75706958770752, + "learning_rate": 3.761487093162482e-06, + "loss": 2.5167, + "step": 8924500 + }, + { + "epoch": 2.7744662066907377, + "grad_norm": 7.383417129516602, + "learning_rate": 3.7588965551543675e-06, + "loss": 2.5855, + "step": 8925000 + }, + { + "epoch": 2.774621638971225, + "grad_norm": 9.28663158416748, + "learning_rate": 3.756306017146253e-06, + "loss": 2.5328, + "step": 8925500 + }, + { + "epoch": 2.7747770712517115, + "grad_norm": 14.008394241333008, + "learning_rate": 3.753715479138138e-06, + "loss": 2.5788, + "step": 8926000 + }, + { + "epoch": 2.774932503532199, + "grad_norm": 11.21998119354248, + "learning_rate": 3.7511249411300243e-06, + "loss": 2.6036, + "step": 8926500 + }, + { + "epoch": 2.775087935812685, + "grad_norm": 10.142644882202148, + "learning_rate": 3.7485344031219094e-06, + "loss": 2.5219, + "step": 8927000 + }, + { + "epoch": 2.7752433680931725, + "grad_norm": 10.733917236328125, + "learning_rate": 3.745943865113795e-06, + "loss": 2.5708, + "step": 8927500 + }, + { + "epoch": 2.775398800373659, + "grad_norm": 10.318947792053223, + "learning_rate": 3.7433533271056803e-06, + "loss": 2.5825, + "step": 8928000 + }, + { + "epoch": 2.7755542326541462, + "grad_norm": 10.612943649291992, + "learning_rate": 3.7407627890975653e-06, + "loss": 2.5061, + "step": 8928500 + }, + { + "epoch": 2.775709664934633, + "grad_norm": 10.231346130371094, + "learning_rate": 3.738172251089451e-06, + "loss": 2.5595, + "step": 8929000 + }, + { + "epoch": 2.77586509721512, + "grad_norm": 10.132246017456055, + "learning_rate": 3.7355817130813362e-06, + "loss": 2.5986, + "step": 8929500 + }, + { + "epoch": 2.776020529495607, + "grad_norm": 9.291824340820312, + "learning_rate": 3.7329911750732213e-06, + "loss": 2.5804, + "step": 8930000 + }, + { + "epoch": 2.7761759617760937, + "grad_norm": 13.758194923400879, + "learning_rate": 3.7304006370651076e-06, + "loss": 2.6282, + "step": 8930500 + }, + { + "epoch": 2.7763313940565806, + "grad_norm": 11.598102569580078, + "learning_rate": 3.7278100990569926e-06, + "loss": 2.5822, + "step": 8931000 + }, + { + "epoch": 2.7764868263370674, + "grad_norm": 8.369193077087402, + "learning_rate": 3.725219561048878e-06, + "loss": 2.5737, + "step": 8931500 + }, + { + "epoch": 2.7766422586175543, + "grad_norm": 10.914198875427246, + "learning_rate": 3.7226290230407635e-06, + "loss": 2.5684, + "step": 8932000 + }, + { + "epoch": 2.776797690898041, + "grad_norm": 11.035674095153809, + "learning_rate": 3.7200384850326486e-06, + "loss": 2.6423, + "step": 8932500 + }, + { + "epoch": 2.776953123178528, + "grad_norm": 10.005095481872559, + "learning_rate": 3.717447947024534e-06, + "loss": 2.6076, + "step": 8933000 + }, + { + "epoch": 2.777108555459015, + "grad_norm": 9.740547180175781, + "learning_rate": 3.7148574090164195e-06, + "loss": 2.5459, + "step": 8933500 + }, + { + "epoch": 2.777263987739502, + "grad_norm": 11.00848388671875, + "learning_rate": 3.7122668710083045e-06, + "loss": 2.578, + "step": 8934000 + }, + { + "epoch": 2.7774194200199886, + "grad_norm": 42.98212432861328, + "learning_rate": 3.70967633300019e-06, + "loss": 2.552, + "step": 8934500 + }, + { + "epoch": 2.7775748523004755, + "grad_norm": 11.389408111572266, + "learning_rate": 3.707085794992076e-06, + "loss": 2.5632, + "step": 8935000 + }, + { + "epoch": 2.7777302845809624, + "grad_norm": 11.148585319519043, + "learning_rate": 3.7044952569839613e-06, + "loss": 2.5874, + "step": 8935500 + }, + { + "epoch": 2.7778857168614493, + "grad_norm": 23.33571434020996, + "learning_rate": 3.701904718975847e-06, + "loss": 2.6119, + "step": 8936000 + }, + { + "epoch": 2.778041149141936, + "grad_norm": 12.502851486206055, + "learning_rate": 3.699314180967732e-06, + "loss": 2.5335, + "step": 8936500 + }, + { + "epoch": 2.778196581422423, + "grad_norm": 14.829489707946777, + "learning_rate": 3.6967236429596173e-06, + "loss": 2.5763, + "step": 8937000 + }, + { + "epoch": 2.77835201370291, + "grad_norm": 14.871530532836914, + "learning_rate": 3.6941331049515028e-06, + "loss": 2.5358, + "step": 8937500 + }, + { + "epoch": 2.7785074459833967, + "grad_norm": 9.915902137756348, + "learning_rate": 3.6915425669433878e-06, + "loss": 2.5368, + "step": 8938000 + }, + { + "epoch": 2.7786628782638836, + "grad_norm": 10.984674453735352, + "learning_rate": 3.6889520289352732e-06, + "loss": 2.579, + "step": 8938500 + }, + { + "epoch": 2.7788183105443705, + "grad_norm": 5.95891809463501, + "learning_rate": 3.6863614909271587e-06, + "loss": 2.5715, + "step": 8939000 + }, + { + "epoch": 2.7789737428248573, + "grad_norm": 8.676732063293457, + "learning_rate": 3.6837709529190446e-06, + "loss": 2.5517, + "step": 8939500 + }, + { + "epoch": 2.779129175105344, + "grad_norm": 10.292366027832031, + "learning_rate": 3.68118041491093e-06, + "loss": 2.5118, + "step": 8940000 + }, + { + "epoch": 2.779284607385831, + "grad_norm": 8.724541664123535, + "learning_rate": 3.678589876902815e-06, + "loss": 2.5668, + "step": 8940500 + }, + { + "epoch": 2.779440039666318, + "grad_norm": 11.861356735229492, + "learning_rate": 3.6759993388947005e-06, + "loss": 2.5687, + "step": 8941000 + }, + { + "epoch": 2.779595471946805, + "grad_norm": 12.486961364746094, + "learning_rate": 3.673408800886586e-06, + "loss": 2.5736, + "step": 8941500 + }, + { + "epoch": 2.7797509042272917, + "grad_norm": 11.985475540161133, + "learning_rate": 3.670818262878471e-06, + "loss": 2.6079, + "step": 8942000 + }, + { + "epoch": 2.7799063365077785, + "grad_norm": 9.976592063903809, + "learning_rate": 3.6682277248703565e-06, + "loss": 2.5837, + "step": 8942500 + }, + { + "epoch": 2.7800617687882654, + "grad_norm": 13.634180068969727, + "learning_rate": 3.665637186862242e-06, + "loss": 2.6191, + "step": 8943000 + }, + { + "epoch": 2.7802172010687523, + "grad_norm": 9.406847953796387, + "learning_rate": 3.6630466488541274e-06, + "loss": 2.5403, + "step": 8943500 + }, + { + "epoch": 2.780372633349239, + "grad_norm": 12.198229789733887, + "learning_rate": 3.6604561108460133e-06, + "loss": 2.5409, + "step": 8944000 + }, + { + "epoch": 2.780528065629726, + "grad_norm": 9.25287914276123, + "learning_rate": 3.6578655728378988e-06, + "loss": 2.5745, + "step": 8944500 + }, + { + "epoch": 2.780683497910213, + "grad_norm": 17.746917724609375, + "learning_rate": 3.655275034829784e-06, + "loss": 2.5689, + "step": 8945000 + }, + { + "epoch": 2.7808389301906997, + "grad_norm": 9.154093742370605, + "learning_rate": 3.6526844968216693e-06, + "loss": 2.6196, + "step": 8945500 + }, + { + "epoch": 2.7809943624711866, + "grad_norm": 49.77821731567383, + "learning_rate": 3.6500939588135547e-06, + "loss": 2.6013, + "step": 8946000 + }, + { + "epoch": 2.7811497947516735, + "grad_norm": 10.123250961303711, + "learning_rate": 3.6475034208054397e-06, + "loss": 2.5615, + "step": 8946500 + }, + { + "epoch": 2.7813052270321608, + "grad_norm": 8.694485664367676, + "learning_rate": 3.644912882797325e-06, + "loss": 2.5926, + "step": 8947000 + }, + { + "epoch": 2.781460659312647, + "grad_norm": 8.749042510986328, + "learning_rate": 3.6423223447892107e-06, + "loss": 2.5876, + "step": 8947500 + }, + { + "epoch": 2.7816160915931345, + "grad_norm": 11.40623950958252, + "learning_rate": 3.6397318067810957e-06, + "loss": 2.5995, + "step": 8948000 + }, + { + "epoch": 2.781771523873621, + "grad_norm": 9.814652442932129, + "learning_rate": 3.637141268772982e-06, + "loss": 2.5611, + "step": 8948500 + }, + { + "epoch": 2.7819269561541082, + "grad_norm": 12.061809539794922, + "learning_rate": 3.634550730764867e-06, + "loss": 2.5544, + "step": 8949000 + }, + { + "epoch": 2.7820823884345947, + "grad_norm": 11.632983207702637, + "learning_rate": 3.6319601927567525e-06, + "loss": 2.5408, + "step": 8949500 + }, + { + "epoch": 2.782237820715082, + "grad_norm": 9.61341381072998, + "learning_rate": 3.629369654748638e-06, + "loss": 2.5657, + "step": 8950000 + }, + { + "epoch": 2.7823932529955684, + "grad_norm": 12.442383766174316, + "learning_rate": 3.626779116740523e-06, + "loss": 2.5748, + "step": 8950500 + }, + { + "epoch": 2.7825486852760557, + "grad_norm": 10.497207641601562, + "learning_rate": 3.6241885787324085e-06, + "loss": 2.5301, + "step": 8951000 + }, + { + "epoch": 2.782704117556542, + "grad_norm": 21.282636642456055, + "learning_rate": 3.621598040724294e-06, + "loss": 2.5213, + "step": 8951500 + }, + { + "epoch": 2.7828595498370294, + "grad_norm": 9.377758979797363, + "learning_rate": 3.619007502716179e-06, + "loss": 2.5438, + "step": 8952000 + }, + { + "epoch": 2.783014982117516, + "grad_norm": 10.101078033447266, + "learning_rate": 3.6164169647080644e-06, + "loss": 2.5746, + "step": 8952500 + }, + { + "epoch": 2.783170414398003, + "grad_norm": 11.674555778503418, + "learning_rate": 3.6138264266999503e-06, + "loss": 2.5455, + "step": 8953000 + }, + { + "epoch": 2.78332584667849, + "grad_norm": 8.294687271118164, + "learning_rate": 3.6112358886918358e-06, + "loss": 2.5694, + "step": 8953500 + }, + { + "epoch": 2.783481278958977, + "grad_norm": 11.535079002380371, + "learning_rate": 3.6086453506837212e-06, + "loss": 2.545, + "step": 8954000 + }, + { + "epoch": 2.7836367112394638, + "grad_norm": 10.791150093078613, + "learning_rate": 3.6060548126756062e-06, + "loss": 2.5439, + "step": 8954500 + }, + { + "epoch": 2.7837921435199506, + "grad_norm": 10.477070808410645, + "learning_rate": 3.6034642746674917e-06, + "loss": 2.5716, + "step": 8955000 + }, + { + "epoch": 2.7839475758004375, + "grad_norm": 11.365153312683105, + "learning_rate": 3.600873736659377e-06, + "loss": 2.587, + "step": 8955500 + }, + { + "epoch": 2.7841030080809244, + "grad_norm": 18.1633358001709, + "learning_rate": 3.598283198651262e-06, + "loss": 2.5883, + "step": 8956000 + }, + { + "epoch": 2.7842584403614112, + "grad_norm": 9.928980827331543, + "learning_rate": 3.5956926606431477e-06, + "loss": 2.5734, + "step": 8956500 + }, + { + "epoch": 2.784413872641898, + "grad_norm": 8.883749961853027, + "learning_rate": 3.593102122635033e-06, + "loss": 2.5983, + "step": 8957000 + }, + { + "epoch": 2.784569304922385, + "grad_norm": 10.687955856323242, + "learning_rate": 3.590511584626919e-06, + "loss": 2.5762, + "step": 8957500 + }, + { + "epoch": 2.784724737202872, + "grad_norm": 8.25586223602295, + "learning_rate": 3.5879210466188045e-06, + "loss": 2.579, + "step": 8958000 + }, + { + "epoch": 2.7848801694833587, + "grad_norm": 10.924059867858887, + "learning_rate": 3.5853305086106895e-06, + "loss": 2.5683, + "step": 8958500 + }, + { + "epoch": 2.7850356017638456, + "grad_norm": 14.58693790435791, + "learning_rate": 3.582739970602575e-06, + "loss": 2.5932, + "step": 8959000 + }, + { + "epoch": 2.7851910340443324, + "grad_norm": 12.165400505065918, + "learning_rate": 3.5801494325944604e-06, + "loss": 2.5624, + "step": 8959500 + }, + { + "epoch": 2.7853464663248193, + "grad_norm": 9.464614868164062, + "learning_rate": 3.5775588945863455e-06, + "loss": 2.548, + "step": 8960000 + }, + { + "epoch": 2.785501898605306, + "grad_norm": 11.728156089782715, + "learning_rate": 3.574968356578231e-06, + "loss": 2.552, + "step": 8960500 + }, + { + "epoch": 2.785657330885793, + "grad_norm": 11.986050605773926, + "learning_rate": 3.5723778185701164e-06, + "loss": 2.6169, + "step": 8961000 + }, + { + "epoch": 2.78581276316628, + "grad_norm": 10.634612083435059, + "learning_rate": 3.5697872805620014e-06, + "loss": 2.5909, + "step": 8961500 + }, + { + "epoch": 2.7859681954467668, + "grad_norm": 16.696592330932617, + "learning_rate": 3.5671967425538877e-06, + "loss": 2.5769, + "step": 8962000 + }, + { + "epoch": 2.7861236277272536, + "grad_norm": 9.986372947692871, + "learning_rate": 3.5646062045457728e-06, + "loss": 2.5808, + "step": 8962500 + }, + { + "epoch": 2.7862790600077405, + "grad_norm": 9.35216999053955, + "learning_rate": 3.562015666537658e-06, + "loss": 2.5256, + "step": 8963000 + }, + { + "epoch": 2.7864344922882274, + "grad_norm": 7.371523857116699, + "learning_rate": 3.5594251285295437e-06, + "loss": 2.5534, + "step": 8963500 + }, + { + "epoch": 2.7865899245687142, + "grad_norm": 17.333255767822266, + "learning_rate": 3.5568345905214287e-06, + "loss": 2.582, + "step": 8964000 + }, + { + "epoch": 2.786745356849201, + "grad_norm": 9.782825469970703, + "learning_rate": 3.554244052513314e-06, + "loss": 2.5789, + "step": 8964500 + }, + { + "epoch": 2.786900789129688, + "grad_norm": 7.408448696136475, + "learning_rate": 3.5516535145051996e-06, + "loss": 2.5616, + "step": 8965000 + }, + { + "epoch": 2.787056221410175, + "grad_norm": 9.130133628845215, + "learning_rate": 3.5490629764970847e-06, + "loss": 2.6146, + "step": 8965500 + }, + { + "epoch": 2.7872116536906617, + "grad_norm": 13.298810958862305, + "learning_rate": 3.54647243848897e-06, + "loss": 2.562, + "step": 8966000 + }, + { + "epoch": 2.7873670859711486, + "grad_norm": 9.993056297302246, + "learning_rate": 3.543881900480856e-06, + "loss": 2.6206, + "step": 8966500 + }, + { + "epoch": 2.7875225182516354, + "grad_norm": 10.140846252441406, + "learning_rate": 3.5412913624727415e-06, + "loss": 2.5974, + "step": 8967000 + }, + { + "epoch": 2.7876779505321223, + "grad_norm": 9.547442436218262, + "learning_rate": 3.538700824464627e-06, + "loss": 2.5694, + "step": 8967500 + }, + { + "epoch": 2.787833382812609, + "grad_norm": 11.008905410766602, + "learning_rate": 3.536110286456512e-06, + "loss": 2.623, + "step": 8968000 + }, + { + "epoch": 2.787988815093096, + "grad_norm": 9.866621971130371, + "learning_rate": 3.5335197484483974e-06, + "loss": 2.5603, + "step": 8968500 + }, + { + "epoch": 2.788144247373583, + "grad_norm": 13.06369400024414, + "learning_rate": 3.530929210440283e-06, + "loss": 2.5883, + "step": 8969000 + }, + { + "epoch": 2.7882996796540698, + "grad_norm": 11.762113571166992, + "learning_rate": 3.528338672432168e-06, + "loss": 2.5628, + "step": 8969500 + }, + { + "epoch": 2.7884551119345566, + "grad_norm": 8.923236846923828, + "learning_rate": 3.5257481344240534e-06, + "loss": 2.5645, + "step": 8970000 + }, + { + "epoch": 2.7886105442150435, + "grad_norm": 9.037476539611816, + "learning_rate": 3.523157596415939e-06, + "loss": 2.5801, + "step": 8970500 + }, + { + "epoch": 2.7887659764955304, + "grad_norm": 11.690352439880371, + "learning_rate": 3.5205670584078247e-06, + "loss": 2.6031, + "step": 8971000 + }, + { + "epoch": 2.7889214087760177, + "grad_norm": 8.366495132446289, + "learning_rate": 3.51797652039971e-06, + "loss": 2.5868, + "step": 8971500 + }, + { + "epoch": 2.789076841056504, + "grad_norm": 6.5268635749816895, + "learning_rate": 3.515385982391595e-06, + "loss": 2.5483, + "step": 8972000 + }, + { + "epoch": 2.7892322733369914, + "grad_norm": 8.63253402709961, + "learning_rate": 3.5127954443834807e-06, + "loss": 2.5819, + "step": 8972500 + }, + { + "epoch": 2.789387705617478, + "grad_norm": 14.112491607666016, + "learning_rate": 3.510204906375366e-06, + "loss": 2.5281, + "step": 8973000 + }, + { + "epoch": 2.789543137897965, + "grad_norm": 13.80799388885498, + "learning_rate": 3.507614368367251e-06, + "loss": 2.579, + "step": 8973500 + }, + { + "epoch": 2.7896985701784516, + "grad_norm": 9.853581428527832, + "learning_rate": 3.5050238303591366e-06, + "loss": 2.5322, + "step": 8974000 + }, + { + "epoch": 2.789854002458939, + "grad_norm": 11.310441970825195, + "learning_rate": 3.502433292351022e-06, + "loss": 2.5898, + "step": 8974500 + }, + { + "epoch": 2.7900094347394253, + "grad_norm": 7.951057434082031, + "learning_rate": 3.499842754342907e-06, + "loss": 2.5616, + "step": 8975000 + }, + { + "epoch": 2.7901648670199126, + "grad_norm": 11.855278015136719, + "learning_rate": 3.4972522163347934e-06, + "loss": 2.5642, + "step": 8975500 + }, + { + "epoch": 2.790320299300399, + "grad_norm": 9.756629943847656, + "learning_rate": 3.4946616783266785e-06, + "loss": 2.5589, + "step": 8976000 + }, + { + "epoch": 2.7904757315808864, + "grad_norm": 10.871258735656738, + "learning_rate": 3.492071140318564e-06, + "loss": 2.5675, + "step": 8976500 + }, + { + "epoch": 2.790631163861373, + "grad_norm": 14.070393562316895, + "learning_rate": 3.4894806023104494e-06, + "loss": 2.6072, + "step": 8977000 + }, + { + "epoch": 2.79078659614186, + "grad_norm": 10.910611152648926, + "learning_rate": 3.4868900643023344e-06, + "loss": 2.5954, + "step": 8977500 + }, + { + "epoch": 2.790942028422347, + "grad_norm": 13.18208122253418, + "learning_rate": 3.48429952629422e-06, + "loss": 2.546, + "step": 8978000 + }, + { + "epoch": 2.791097460702834, + "grad_norm": 9.067875862121582, + "learning_rate": 3.4817089882861053e-06, + "loss": 2.5528, + "step": 8978500 + }, + { + "epoch": 2.7912528929833207, + "grad_norm": 9.485679626464844, + "learning_rate": 3.4791184502779904e-06, + "loss": 2.5739, + "step": 8979000 + }, + { + "epoch": 2.7914083252638076, + "grad_norm": 9.321615219116211, + "learning_rate": 3.476527912269876e-06, + "loss": 2.543, + "step": 8979500 + }, + { + "epoch": 2.7915637575442944, + "grad_norm": 8.810432434082031, + "learning_rate": 3.4739373742617617e-06, + "loss": 2.5719, + "step": 8980000 + }, + { + "epoch": 2.7917191898247813, + "grad_norm": 11.883565902709961, + "learning_rate": 3.471346836253647e-06, + "loss": 2.5786, + "step": 8980500 + }, + { + "epoch": 2.791874622105268, + "grad_norm": 16.874391555786133, + "learning_rate": 3.4687562982455326e-06, + "loss": 2.6007, + "step": 8981000 + }, + { + "epoch": 2.792030054385755, + "grad_norm": 11.847426414489746, + "learning_rate": 3.4661657602374177e-06, + "loss": 2.6074, + "step": 8981500 + }, + { + "epoch": 2.792185486666242, + "grad_norm": 25.084232330322266, + "learning_rate": 3.463575222229303e-06, + "loss": 2.5595, + "step": 8982000 + }, + { + "epoch": 2.7923409189467288, + "grad_norm": 10.161446571350098, + "learning_rate": 3.4609846842211886e-06, + "loss": 2.592, + "step": 8982500 + }, + { + "epoch": 2.7924963512272156, + "grad_norm": 9.958799362182617, + "learning_rate": 3.4583941462130736e-06, + "loss": 2.5536, + "step": 8983000 + }, + { + "epoch": 2.7926517835077025, + "grad_norm": 9.683754920959473, + "learning_rate": 3.455803608204959e-06, + "loss": 2.5833, + "step": 8983500 + }, + { + "epoch": 2.7928072157881894, + "grad_norm": 9.599751472473145, + "learning_rate": 3.4532130701968445e-06, + "loss": 2.5566, + "step": 8984000 + }, + { + "epoch": 2.7929626480686762, + "grad_norm": 10.006877899169922, + "learning_rate": 3.4506225321887304e-06, + "loss": 2.5331, + "step": 8984500 + }, + { + "epoch": 2.793118080349163, + "grad_norm": 24.303319931030273, + "learning_rate": 3.448031994180616e-06, + "loss": 2.5832, + "step": 8985000 + }, + { + "epoch": 2.79327351262965, + "grad_norm": 11.887150764465332, + "learning_rate": 3.445441456172501e-06, + "loss": 2.519, + "step": 8985500 + }, + { + "epoch": 2.793428944910137, + "grad_norm": 14.755193710327148, + "learning_rate": 3.4428509181643864e-06, + "loss": 2.5467, + "step": 8986000 + }, + { + "epoch": 2.7935843771906237, + "grad_norm": 10.051375389099121, + "learning_rate": 3.440260380156272e-06, + "loss": 2.5599, + "step": 8986500 + }, + { + "epoch": 2.7937398094711106, + "grad_norm": 8.84445858001709, + "learning_rate": 3.437669842148157e-06, + "loss": 2.583, + "step": 8987000 + }, + { + "epoch": 2.7938952417515974, + "grad_norm": 12.739253044128418, + "learning_rate": 3.4350793041400423e-06, + "loss": 2.5798, + "step": 8987500 + }, + { + "epoch": 2.7940506740320843, + "grad_norm": 11.145669937133789, + "learning_rate": 3.4324887661319278e-06, + "loss": 2.5929, + "step": 8988000 + }, + { + "epoch": 2.794206106312571, + "grad_norm": 8.799958229064941, + "learning_rate": 3.429898228123813e-06, + "loss": 2.5663, + "step": 8988500 + }, + { + "epoch": 2.794361538593058, + "grad_norm": 9.45361042022705, + "learning_rate": 3.427307690115699e-06, + "loss": 2.5918, + "step": 8989000 + }, + { + "epoch": 2.794516970873545, + "grad_norm": 9.109457969665527, + "learning_rate": 3.424717152107584e-06, + "loss": 2.5157, + "step": 8989500 + }, + { + "epoch": 2.7946724031540318, + "grad_norm": 19.727907180786133, + "learning_rate": 3.4221266140994696e-06, + "loss": 2.5345, + "step": 8990000 + }, + { + "epoch": 2.7948278354345186, + "grad_norm": 11.836751937866211, + "learning_rate": 3.419536076091355e-06, + "loss": 2.5593, + "step": 8990500 + }, + { + "epoch": 2.7949832677150055, + "grad_norm": 10.521775245666504, + "learning_rate": 3.4169455380832405e-06, + "loss": 2.5407, + "step": 8991000 + }, + { + "epoch": 2.7951386999954924, + "grad_norm": 14.69140338897705, + "learning_rate": 3.4143550000751256e-06, + "loss": 2.5645, + "step": 8991500 + }, + { + "epoch": 2.7952941322759792, + "grad_norm": 16.152769088745117, + "learning_rate": 3.411764462067011e-06, + "loss": 2.5577, + "step": 8992000 + }, + { + "epoch": 2.795449564556466, + "grad_norm": 8.11105728149414, + "learning_rate": 3.4091739240588965e-06, + "loss": 2.5706, + "step": 8992500 + }, + { + "epoch": 2.795604996836953, + "grad_norm": 10.800597190856934, + "learning_rate": 3.4065833860507815e-06, + "loss": 2.5879, + "step": 8993000 + }, + { + "epoch": 2.79576042911744, + "grad_norm": 9.090258598327637, + "learning_rate": 3.403992848042668e-06, + "loss": 2.5558, + "step": 8993500 + }, + { + "epoch": 2.7959158613979267, + "grad_norm": 8.770383834838867, + "learning_rate": 3.401402310034553e-06, + "loss": 2.5451, + "step": 8994000 + }, + { + "epoch": 2.7960712936784136, + "grad_norm": 10.976299285888672, + "learning_rate": 3.3988117720264383e-06, + "loss": 2.6017, + "step": 8994500 + }, + { + "epoch": 2.796226725958901, + "grad_norm": 13.751729011535645, + "learning_rate": 3.396221234018324e-06, + "loss": 2.5595, + "step": 8995000 + }, + { + "epoch": 2.7963821582393873, + "grad_norm": 10.636704444885254, + "learning_rate": 3.393630696010209e-06, + "loss": 2.5652, + "step": 8995500 + }, + { + "epoch": 2.7965375905198746, + "grad_norm": 11.110539436340332, + "learning_rate": 3.3910401580020943e-06, + "loss": 2.5578, + "step": 8996000 + }, + { + "epoch": 2.796693022800361, + "grad_norm": 8.41463851928711, + "learning_rate": 3.3884496199939797e-06, + "loss": 2.5641, + "step": 8996500 + }, + { + "epoch": 2.7968484550808483, + "grad_norm": 9.444833755493164, + "learning_rate": 3.3858590819858648e-06, + "loss": 2.523, + "step": 8997000 + }, + { + "epoch": 2.7970038873613348, + "grad_norm": 8.751984596252441, + "learning_rate": 3.383268543977751e-06, + "loss": 2.5196, + "step": 8997500 + }, + { + "epoch": 2.797159319641822, + "grad_norm": 21.84528350830078, + "learning_rate": 3.380678005969636e-06, + "loss": 2.5901, + "step": 8998000 + }, + { + "epoch": 2.7973147519223085, + "grad_norm": 13.225831985473633, + "learning_rate": 3.3780874679615216e-06, + "loss": 2.5181, + "step": 8998500 + }, + { + "epoch": 2.797470184202796, + "grad_norm": 10.442249298095703, + "learning_rate": 3.375496929953407e-06, + "loss": 2.609, + "step": 8999000 + }, + { + "epoch": 2.7976256164832822, + "grad_norm": 13.2993803024292, + "learning_rate": 3.372906391945292e-06, + "loss": 2.541, + "step": 8999500 + }, + { + "epoch": 2.7977810487637695, + "grad_norm": 10.451242446899414, + "learning_rate": 3.3703158539371775e-06, + "loss": 2.5691, + "step": 9000000 + }, + { + "epoch": 2.797936481044256, + "grad_norm": 16.069948196411133, + "learning_rate": 3.367725315929063e-06, + "loss": 2.5731, + "step": 9000500 + }, + { + "epoch": 2.7980919133247433, + "grad_norm": 12.70691204071045, + "learning_rate": 3.365134777920948e-06, + "loss": 2.6091, + "step": 9001000 + }, + { + "epoch": 2.79824734560523, + "grad_norm": 10.882238388061523, + "learning_rate": 3.3625442399128335e-06, + "loss": 2.5857, + "step": 9001500 + }, + { + "epoch": 2.798402777885717, + "grad_norm": 10.6556978225708, + "learning_rate": 3.3599537019047194e-06, + "loss": 2.5771, + "step": 9002000 + }, + { + "epoch": 2.798558210166204, + "grad_norm": 9.226783752441406, + "learning_rate": 3.357363163896605e-06, + "loss": 2.52, + "step": 9002500 + }, + { + "epoch": 2.7987136424466907, + "grad_norm": 10.642023086547852, + "learning_rate": 3.3547726258884903e-06, + "loss": 2.5277, + "step": 9003000 + }, + { + "epoch": 2.7988690747271776, + "grad_norm": 10.826512336730957, + "learning_rate": 3.3521820878803753e-06, + "loss": 2.5595, + "step": 9003500 + }, + { + "epoch": 2.7990245070076645, + "grad_norm": 10.73698616027832, + "learning_rate": 3.349591549872261e-06, + "loss": 2.5687, + "step": 9004000 + }, + { + "epoch": 2.7991799392881513, + "grad_norm": 9.521254539489746, + "learning_rate": 3.3470010118641463e-06, + "loss": 2.5688, + "step": 9004500 + }, + { + "epoch": 2.799335371568638, + "grad_norm": 9.68973445892334, + "learning_rate": 3.3444104738560313e-06, + "loss": 2.5442, + "step": 9005000 + }, + { + "epoch": 2.799490803849125, + "grad_norm": 13.396527290344238, + "learning_rate": 3.3418199358479167e-06, + "loss": 2.563, + "step": 9005500 + }, + { + "epoch": 2.799646236129612, + "grad_norm": 10.077905654907227, + "learning_rate": 3.339229397839802e-06, + "loss": 2.51, + "step": 9006000 + }, + { + "epoch": 2.799801668410099, + "grad_norm": 9.428397178649902, + "learning_rate": 3.336638859831688e-06, + "loss": 2.5228, + "step": 9006500 + }, + { + "epoch": 2.7999571006905857, + "grad_norm": 9.971651077270508, + "learning_rate": 3.3340483218235735e-06, + "loss": 2.5848, + "step": 9007000 + }, + { + "epoch": 2.8001125329710725, + "grad_norm": 18.71134376525879, + "learning_rate": 3.3314577838154586e-06, + "loss": 2.5641, + "step": 9007500 + }, + { + "epoch": 2.8002679652515594, + "grad_norm": 9.053165435791016, + "learning_rate": 3.328867245807344e-06, + "loss": 2.5568, + "step": 9008000 + }, + { + "epoch": 2.8004233975320463, + "grad_norm": 10.994855880737305, + "learning_rate": 3.3262767077992295e-06, + "loss": 2.5925, + "step": 9008500 + }, + { + "epoch": 2.800578829812533, + "grad_norm": 10.109700202941895, + "learning_rate": 3.3236861697911145e-06, + "loss": 2.5477, + "step": 9009000 + }, + { + "epoch": 2.80073426209302, + "grad_norm": 12.236676216125488, + "learning_rate": 3.321095631783e-06, + "loss": 2.5276, + "step": 9009500 + }, + { + "epoch": 2.800889694373507, + "grad_norm": 11.93228816986084, + "learning_rate": 3.3185050937748855e-06, + "loss": 2.5708, + "step": 9010000 + }, + { + "epoch": 2.8010451266539937, + "grad_norm": 8.547941207885742, + "learning_rate": 3.3159145557667705e-06, + "loss": 2.5957, + "step": 9010500 + }, + { + "epoch": 2.8012005589344806, + "grad_norm": 10.684946060180664, + "learning_rate": 3.313324017758657e-06, + "loss": 2.5711, + "step": 9011000 + }, + { + "epoch": 2.8013559912149675, + "grad_norm": 8.611772537231445, + "learning_rate": 3.310733479750542e-06, + "loss": 2.6138, + "step": 9011500 + }, + { + "epoch": 2.8015114234954543, + "grad_norm": 10.278231620788574, + "learning_rate": 3.3081429417424273e-06, + "loss": 2.5752, + "step": 9012000 + }, + { + "epoch": 2.801666855775941, + "grad_norm": 10.604034423828125, + "learning_rate": 3.3055524037343128e-06, + "loss": 2.5521, + "step": 9012500 + }, + { + "epoch": 2.801822288056428, + "grad_norm": 9.738982200622559, + "learning_rate": 3.3029618657261978e-06, + "loss": 2.5557, + "step": 9013000 + }, + { + "epoch": 2.801977720336915, + "grad_norm": 10.61590576171875, + "learning_rate": 3.3003713277180832e-06, + "loss": 2.5918, + "step": 9013500 + }, + { + "epoch": 2.802133152617402, + "grad_norm": 10.646162033081055, + "learning_rate": 3.2977807897099687e-06, + "loss": 2.5688, + "step": 9014000 + }, + { + "epoch": 2.8022885848978887, + "grad_norm": 9.193017959594727, + "learning_rate": 3.2951902517018537e-06, + "loss": 2.5677, + "step": 9014500 + }, + { + "epoch": 2.8024440171783755, + "grad_norm": 11.465683937072754, + "learning_rate": 3.292599713693739e-06, + "loss": 2.5689, + "step": 9015000 + }, + { + "epoch": 2.8025994494588624, + "grad_norm": 13.35472297668457, + "learning_rate": 3.290009175685625e-06, + "loss": 2.5463, + "step": 9015500 + }, + { + "epoch": 2.8027548817393493, + "grad_norm": 10.603944778442383, + "learning_rate": 3.2874186376775105e-06, + "loss": 2.5863, + "step": 9016000 + }, + { + "epoch": 2.802910314019836, + "grad_norm": 9.772083282470703, + "learning_rate": 3.284828099669396e-06, + "loss": 2.5537, + "step": 9016500 + }, + { + "epoch": 2.803065746300323, + "grad_norm": 12.111322402954102, + "learning_rate": 3.282237561661281e-06, + "loss": 2.5661, + "step": 9017000 + }, + { + "epoch": 2.80322117858081, + "grad_norm": 10.505934715270996, + "learning_rate": 3.2796470236531665e-06, + "loss": 2.529, + "step": 9017500 + }, + { + "epoch": 2.8033766108612967, + "grad_norm": 17.983783721923828, + "learning_rate": 3.277056485645052e-06, + "loss": 2.5912, + "step": 9018000 + }, + { + "epoch": 2.8035320431417836, + "grad_norm": 13.637619018554688, + "learning_rate": 3.274465947636937e-06, + "loss": 2.55, + "step": 9018500 + }, + { + "epoch": 2.8036874754222705, + "grad_norm": 8.39404010772705, + "learning_rate": 3.2718754096288225e-06, + "loss": 2.5496, + "step": 9019000 + }, + { + "epoch": 2.803842907702758, + "grad_norm": 19.14055061340332, + "learning_rate": 3.269284871620708e-06, + "loss": 2.5864, + "step": 9019500 + }, + { + "epoch": 2.803998339983244, + "grad_norm": 11.54520034790039, + "learning_rate": 3.266694333612594e-06, + "loss": 2.5715, + "step": 9020000 + }, + { + "epoch": 2.8041537722637315, + "grad_norm": 10.110929489135742, + "learning_rate": 3.2641037956044793e-06, + "loss": 2.5518, + "step": 9020500 + }, + { + "epoch": 2.804309204544218, + "grad_norm": 25.104812622070312, + "learning_rate": 3.2615132575963643e-06, + "loss": 2.588, + "step": 9021000 + }, + { + "epoch": 2.8044646368247053, + "grad_norm": 9.152915954589844, + "learning_rate": 3.2589227195882497e-06, + "loss": 2.5681, + "step": 9021500 + }, + { + "epoch": 2.8046200691051917, + "grad_norm": 10.193718910217285, + "learning_rate": 3.256332181580135e-06, + "loss": 2.5227, + "step": 9022000 + }, + { + "epoch": 2.804775501385679, + "grad_norm": 9.948369979858398, + "learning_rate": 3.2537416435720202e-06, + "loss": 2.5493, + "step": 9022500 + }, + { + "epoch": 2.8049309336661654, + "grad_norm": 12.168473243713379, + "learning_rate": 3.2511511055639057e-06, + "loss": 2.5479, + "step": 9023000 + }, + { + "epoch": 2.8050863659466527, + "grad_norm": 10.00189208984375, + "learning_rate": 3.248560567555791e-06, + "loss": 2.5952, + "step": 9023500 + }, + { + "epoch": 2.805241798227139, + "grad_norm": 7.175137042999268, + "learning_rate": 3.245970029547676e-06, + "loss": 2.5584, + "step": 9024000 + }, + { + "epoch": 2.8053972305076265, + "grad_norm": 11.300262451171875, + "learning_rate": 3.2433794915395625e-06, + "loss": 2.5741, + "step": 9024500 + }, + { + "epoch": 2.8055526627881133, + "grad_norm": 12.577738761901855, + "learning_rate": 3.2407889535314475e-06, + "loss": 2.5716, + "step": 9025000 + }, + { + "epoch": 2.8057080950686, + "grad_norm": 10.360854148864746, + "learning_rate": 3.238198415523333e-06, + "loss": 2.6032, + "step": 9025500 + }, + { + "epoch": 2.805863527349087, + "grad_norm": 9.219745635986328, + "learning_rate": 3.2356078775152185e-06, + "loss": 2.6025, + "step": 9026000 + }, + { + "epoch": 2.806018959629574, + "grad_norm": 11.272765159606934, + "learning_rate": 3.2330173395071035e-06, + "loss": 2.5792, + "step": 9026500 + }, + { + "epoch": 2.806174391910061, + "grad_norm": 11.944975852966309, + "learning_rate": 3.230426801498989e-06, + "loss": 2.5236, + "step": 9027000 + }, + { + "epoch": 2.8063298241905477, + "grad_norm": 17.3975887298584, + "learning_rate": 3.2278362634908744e-06, + "loss": 2.5848, + "step": 9027500 + }, + { + "epoch": 2.8064852564710345, + "grad_norm": 11.019776344299316, + "learning_rate": 3.2252457254827594e-06, + "loss": 2.5774, + "step": 9028000 + }, + { + "epoch": 2.8066406887515214, + "grad_norm": 17.20481300354004, + "learning_rate": 3.222655187474645e-06, + "loss": 2.6394, + "step": 9028500 + }, + { + "epoch": 2.8067961210320083, + "grad_norm": 8.640302658081055, + "learning_rate": 3.220064649466531e-06, + "loss": 2.5825, + "step": 9029000 + }, + { + "epoch": 2.806951553312495, + "grad_norm": 13.712369918823242, + "learning_rate": 3.2174741114584163e-06, + "loss": 2.5854, + "step": 9029500 + }, + { + "epoch": 2.807106985592982, + "grad_norm": 10.452913284301758, + "learning_rate": 3.2148835734503017e-06, + "loss": 2.572, + "step": 9030000 + }, + { + "epoch": 2.807262417873469, + "grad_norm": 10.921339988708496, + "learning_rate": 3.2122930354421867e-06, + "loss": 2.5648, + "step": 9030500 + }, + { + "epoch": 2.8074178501539557, + "grad_norm": 9.907848358154297, + "learning_rate": 3.209702497434072e-06, + "loss": 2.5566, + "step": 9031000 + }, + { + "epoch": 2.8075732824344426, + "grad_norm": 8.646288871765137, + "learning_rate": 3.2071119594259577e-06, + "loss": 2.5633, + "step": 9031500 + }, + { + "epoch": 2.8077287147149295, + "grad_norm": 11.825082778930664, + "learning_rate": 3.2045214214178427e-06, + "loss": 2.5697, + "step": 9032000 + }, + { + "epoch": 2.8078841469954163, + "grad_norm": 12.87171745300293, + "learning_rate": 3.201930883409728e-06, + "loss": 2.5266, + "step": 9032500 + }, + { + "epoch": 2.808039579275903, + "grad_norm": 10.981674194335938, + "learning_rate": 3.1993403454016136e-06, + "loss": 2.6143, + "step": 9033000 + }, + { + "epoch": 2.80819501155639, + "grad_norm": 11.554041862487793, + "learning_rate": 3.1967498073934995e-06, + "loss": 2.5907, + "step": 9033500 + }, + { + "epoch": 2.808350443836877, + "grad_norm": 9.390342712402344, + "learning_rate": 3.194159269385385e-06, + "loss": 2.5433, + "step": 9034000 + }, + { + "epoch": 2.808505876117364, + "grad_norm": 11.487808227539062, + "learning_rate": 3.19156873137727e-06, + "loss": 2.603, + "step": 9034500 + }, + { + "epoch": 2.8086613083978507, + "grad_norm": 11.279891014099121, + "learning_rate": 3.1889781933691555e-06, + "loss": 2.4934, + "step": 9035000 + }, + { + "epoch": 2.8088167406783375, + "grad_norm": 7.847511291503906, + "learning_rate": 3.186387655361041e-06, + "loss": 2.5508, + "step": 9035500 + }, + { + "epoch": 2.8089721729588244, + "grad_norm": 10.522965431213379, + "learning_rate": 3.183797117352926e-06, + "loss": 2.5311, + "step": 9036000 + }, + { + "epoch": 2.8091276052393113, + "grad_norm": 7.5739336013793945, + "learning_rate": 3.1812065793448114e-06, + "loss": 2.6133, + "step": 9036500 + }, + { + "epoch": 2.809283037519798, + "grad_norm": 8.40208625793457, + "learning_rate": 3.178616041336697e-06, + "loss": 2.5785, + "step": 9037000 + }, + { + "epoch": 2.809438469800285, + "grad_norm": 9.278820037841797, + "learning_rate": 3.1760255033285823e-06, + "loss": 2.5615, + "step": 9037500 + }, + { + "epoch": 2.809593902080772, + "grad_norm": 9.73277759552002, + "learning_rate": 3.173434965320468e-06, + "loss": 2.5849, + "step": 9038000 + }, + { + "epoch": 2.8097493343612587, + "grad_norm": 8.925422668457031, + "learning_rate": 3.1708444273123532e-06, + "loss": 2.5821, + "step": 9038500 + }, + { + "epoch": 2.8099047666417456, + "grad_norm": 10.92058277130127, + "learning_rate": 3.1682538893042387e-06, + "loss": 2.5964, + "step": 9039000 + }, + { + "epoch": 2.8100601989222325, + "grad_norm": 13.136312484741211, + "learning_rate": 3.165663351296124e-06, + "loss": 2.5816, + "step": 9039500 + }, + { + "epoch": 2.8102156312027193, + "grad_norm": 15.428627967834473, + "learning_rate": 3.1630728132880096e-06, + "loss": 2.5808, + "step": 9040000 + }, + { + "epoch": 2.810371063483206, + "grad_norm": 12.042881965637207, + "learning_rate": 3.1604822752798947e-06, + "loss": 2.5357, + "step": 9040500 + }, + { + "epoch": 2.810526495763693, + "grad_norm": 10.598442077636719, + "learning_rate": 3.15789173727178e-06, + "loss": 2.5957, + "step": 9041000 + }, + { + "epoch": 2.81068192804418, + "grad_norm": 10.469452857971191, + "learning_rate": 3.1553011992636656e-06, + "loss": 2.5167, + "step": 9041500 + }, + { + "epoch": 2.810837360324667, + "grad_norm": 12.892602920532227, + "learning_rate": 3.1527106612555506e-06, + "loss": 2.5685, + "step": 9042000 + }, + { + "epoch": 2.8109927926051537, + "grad_norm": 12.397443771362305, + "learning_rate": 3.150120123247437e-06, + "loss": 2.5694, + "step": 9042500 + }, + { + "epoch": 2.8111482248856405, + "grad_norm": 11.813603401184082, + "learning_rate": 3.147529585239322e-06, + "loss": 2.577, + "step": 9043000 + }, + { + "epoch": 2.8113036571661274, + "grad_norm": 10.970756530761719, + "learning_rate": 3.1449390472312074e-06, + "loss": 2.533, + "step": 9043500 + }, + { + "epoch": 2.8114590894466147, + "grad_norm": 9.258956909179688, + "learning_rate": 3.142348509223093e-06, + "loss": 2.6487, + "step": 9044000 + }, + { + "epoch": 2.811614521727101, + "grad_norm": 10.172006607055664, + "learning_rate": 3.139757971214978e-06, + "loss": 2.5597, + "step": 9044500 + }, + { + "epoch": 2.8117699540075884, + "grad_norm": 11.614072799682617, + "learning_rate": 3.1371674332068634e-06, + "loss": 2.5693, + "step": 9045000 + }, + { + "epoch": 2.811925386288075, + "grad_norm": 10.513394355773926, + "learning_rate": 3.134576895198749e-06, + "loss": 2.5624, + "step": 9045500 + }, + { + "epoch": 2.812080818568562, + "grad_norm": 10.44103717803955, + "learning_rate": 3.131986357190634e-06, + "loss": 2.565, + "step": 9046000 + }, + { + "epoch": 2.8122362508490486, + "grad_norm": 20.594112396240234, + "learning_rate": 3.1293958191825193e-06, + "loss": 2.5567, + "step": 9046500 + }, + { + "epoch": 2.812391683129536, + "grad_norm": 13.909222602844238, + "learning_rate": 3.126805281174405e-06, + "loss": 2.5606, + "step": 9047000 + }, + { + "epoch": 2.8125471154100223, + "grad_norm": 12.593005180358887, + "learning_rate": 3.1242147431662902e-06, + "loss": 2.5841, + "step": 9047500 + }, + { + "epoch": 2.8127025476905096, + "grad_norm": 9.04963207244873, + "learning_rate": 3.121624205158176e-06, + "loss": 2.5672, + "step": 9048000 + }, + { + "epoch": 2.812857979970996, + "grad_norm": 9.657503128051758, + "learning_rate": 3.119033667150061e-06, + "loss": 2.5709, + "step": 9048500 + }, + { + "epoch": 2.8130134122514834, + "grad_norm": 7.578089237213135, + "learning_rate": 3.1164431291419466e-06, + "loss": 2.5334, + "step": 9049000 + }, + { + "epoch": 2.8131688445319702, + "grad_norm": 8.066685676574707, + "learning_rate": 3.113852591133832e-06, + "loss": 2.5898, + "step": 9049500 + }, + { + "epoch": 2.813324276812457, + "grad_norm": 9.376800537109375, + "learning_rate": 3.1112620531257175e-06, + "loss": 2.5408, + "step": 9050000 + }, + { + "epoch": 2.813479709092944, + "grad_norm": 39.76849365234375, + "learning_rate": 3.108671515117603e-06, + "loss": 2.5508, + "step": 9050500 + }, + { + "epoch": 2.813635141373431, + "grad_norm": 15.24866771697998, + "learning_rate": 3.106080977109488e-06, + "loss": 2.5777, + "step": 9051000 + }, + { + "epoch": 2.8137905736539177, + "grad_norm": 12.58963680267334, + "learning_rate": 3.1034904391013735e-06, + "loss": 2.5687, + "step": 9051500 + }, + { + "epoch": 2.8139460059344046, + "grad_norm": 12.2692232131958, + "learning_rate": 3.100899901093259e-06, + "loss": 2.5876, + "step": 9052000 + }, + { + "epoch": 2.8141014382148914, + "grad_norm": 10.743963241577148, + "learning_rate": 3.0983093630851444e-06, + "loss": 2.5234, + "step": 9052500 + }, + { + "epoch": 2.8142568704953783, + "grad_norm": 10.07751750946045, + "learning_rate": 3.09571882507703e-06, + "loss": 2.5686, + "step": 9053000 + }, + { + "epoch": 2.814412302775865, + "grad_norm": 9.46142292022705, + "learning_rate": 3.0931282870689153e-06, + "loss": 2.5805, + "step": 9053500 + }, + { + "epoch": 2.814567735056352, + "grad_norm": 11.729752540588379, + "learning_rate": 3.0905377490608004e-06, + "loss": 2.592, + "step": 9054000 + }, + { + "epoch": 2.814723167336839, + "grad_norm": 8.940689086914062, + "learning_rate": 3.0879472110526863e-06, + "loss": 2.5403, + "step": 9054500 + }, + { + "epoch": 2.814878599617326, + "grad_norm": 9.23021411895752, + "learning_rate": 3.0853566730445713e-06, + "loss": 2.619, + "step": 9055000 + }, + { + "epoch": 2.8150340318978126, + "grad_norm": 13.260004043579102, + "learning_rate": 3.0827661350364567e-06, + "loss": 2.5364, + "step": 9055500 + }, + { + "epoch": 2.8151894641782995, + "grad_norm": 12.450726509094238, + "learning_rate": 3.080175597028342e-06, + "loss": 2.5255, + "step": 9056000 + }, + { + "epoch": 2.8153448964587864, + "grad_norm": 12.243297576904297, + "learning_rate": 3.0775850590202272e-06, + "loss": 2.6102, + "step": 9056500 + }, + { + "epoch": 2.8155003287392733, + "grad_norm": 11.62378215789795, + "learning_rate": 3.074994521012113e-06, + "loss": 2.5505, + "step": 9057000 + }, + { + "epoch": 2.81565576101976, + "grad_norm": 9.874835014343262, + "learning_rate": 3.0724039830039986e-06, + "loss": 2.4901, + "step": 9057500 + }, + { + "epoch": 2.815811193300247, + "grad_norm": 9.967620849609375, + "learning_rate": 3.0698134449958836e-06, + "loss": 2.5728, + "step": 9058000 + }, + { + "epoch": 2.815966625580734, + "grad_norm": 10.550124168395996, + "learning_rate": 3.067222906987769e-06, + "loss": 2.5749, + "step": 9058500 + }, + { + "epoch": 2.8161220578612207, + "grad_norm": 11.027125358581543, + "learning_rate": 3.0646323689796545e-06, + "loss": 2.5845, + "step": 9059000 + }, + { + "epoch": 2.8162774901417076, + "grad_norm": 10.569912910461426, + "learning_rate": 3.06204183097154e-06, + "loss": 2.5854, + "step": 9059500 + }, + { + "epoch": 2.8164329224221945, + "grad_norm": 9.305374145507812, + "learning_rate": 3.0594512929634255e-06, + "loss": 2.5651, + "step": 9060000 + }, + { + "epoch": 2.8165883547026813, + "grad_norm": 8.914966583251953, + "learning_rate": 3.0568607549553105e-06, + "loss": 2.5344, + "step": 9060500 + }, + { + "epoch": 2.816743786983168, + "grad_norm": 9.512202262878418, + "learning_rate": 3.054270216947196e-06, + "loss": 2.6171, + "step": 9061000 + }, + { + "epoch": 2.816899219263655, + "grad_norm": 9.938166618347168, + "learning_rate": 3.051679678939082e-06, + "loss": 2.5613, + "step": 9061500 + }, + { + "epoch": 2.817054651544142, + "grad_norm": 10.123220443725586, + "learning_rate": 3.049089140930967e-06, + "loss": 2.5992, + "step": 9062000 + }, + { + "epoch": 2.817210083824629, + "grad_norm": 12.523652076721191, + "learning_rate": 3.0464986029228523e-06, + "loss": 2.581, + "step": 9062500 + }, + { + "epoch": 2.8173655161051157, + "grad_norm": 7.289600372314453, + "learning_rate": 3.043908064914738e-06, + "loss": 2.5551, + "step": 9063000 + }, + { + "epoch": 2.8175209483856025, + "grad_norm": 7.0598978996276855, + "learning_rate": 3.0413175269066232e-06, + "loss": 2.5615, + "step": 9063500 + }, + { + "epoch": 2.8176763806660894, + "grad_norm": 11.288243293762207, + "learning_rate": 3.0387269888985087e-06, + "loss": 2.5507, + "step": 9064000 + }, + { + "epoch": 2.8178318129465763, + "grad_norm": 5.948212146759033, + "learning_rate": 3.036136450890394e-06, + "loss": 2.6042, + "step": 9064500 + }, + { + "epoch": 2.817987245227063, + "grad_norm": 10.967463493347168, + "learning_rate": 3.033545912882279e-06, + "loss": 2.5523, + "step": 9065000 + }, + { + "epoch": 2.81814267750755, + "grad_norm": 14.884970664978027, + "learning_rate": 3.0309553748741647e-06, + "loss": 2.5752, + "step": 9065500 + }, + { + "epoch": 2.818298109788037, + "grad_norm": 13.363718032836914, + "learning_rate": 3.02836483686605e-06, + "loss": 2.5551, + "step": 9066000 + }, + { + "epoch": 2.8184535420685237, + "grad_norm": 11.729796409606934, + "learning_rate": 3.0257742988579356e-06, + "loss": 2.5647, + "step": 9066500 + }, + { + "epoch": 2.8186089743490106, + "grad_norm": 10.119842529296875, + "learning_rate": 3.023183760849821e-06, + "loss": 2.5785, + "step": 9067000 + }, + { + "epoch": 2.818764406629498, + "grad_norm": 11.716385841369629, + "learning_rate": 3.020593222841706e-06, + "loss": 2.6165, + "step": 9067500 + }, + { + "epoch": 2.8189198389099843, + "grad_norm": 8.645180702209473, + "learning_rate": 3.018002684833592e-06, + "loss": 2.5333, + "step": 9068000 + }, + { + "epoch": 2.8190752711904716, + "grad_norm": 9.007549285888672, + "learning_rate": 3.0154121468254774e-06, + "loss": 2.5826, + "step": 9068500 + }, + { + "epoch": 2.819230703470958, + "grad_norm": 9.62087345123291, + "learning_rate": 3.0128216088173625e-06, + "loss": 2.5403, + "step": 9069000 + }, + { + "epoch": 2.8193861357514454, + "grad_norm": 12.620989799499512, + "learning_rate": 3.010231070809248e-06, + "loss": 2.5354, + "step": 9069500 + }, + { + "epoch": 2.819541568031932, + "grad_norm": 11.589129447937012, + "learning_rate": 3.0076405328011334e-06, + "loss": 2.5604, + "step": 9070000 + }, + { + "epoch": 2.819697000312419, + "grad_norm": 9.387422561645508, + "learning_rate": 3.005049994793019e-06, + "loss": 2.5322, + "step": 9070500 + }, + { + "epoch": 2.8198524325929055, + "grad_norm": 9.656906127929688, + "learning_rate": 3.0024594567849043e-06, + "loss": 2.5748, + "step": 9071000 + }, + { + "epoch": 2.820007864873393, + "grad_norm": 9.707310676574707, + "learning_rate": 2.9998689187767893e-06, + "loss": 2.5645, + "step": 9071500 + }, + { + "epoch": 2.8201632971538793, + "grad_norm": 8.072282791137695, + "learning_rate": 2.9972783807686748e-06, + "loss": 2.5694, + "step": 9072000 + }, + { + "epoch": 2.8203187294343666, + "grad_norm": 11.550827026367188, + "learning_rate": 2.9946878427605607e-06, + "loss": 2.5739, + "step": 9072500 + }, + { + "epoch": 2.820474161714853, + "grad_norm": 8.466736793518066, + "learning_rate": 2.9920973047524457e-06, + "loss": 2.5583, + "step": 9073000 + }, + { + "epoch": 2.8206295939953403, + "grad_norm": 9.559898376464844, + "learning_rate": 2.989506766744331e-06, + "loss": 2.5601, + "step": 9073500 + }, + { + "epoch": 2.820785026275827, + "grad_norm": 11.069580078125, + "learning_rate": 2.9869162287362166e-06, + "loss": 2.5395, + "step": 9074000 + }, + { + "epoch": 2.820940458556314, + "grad_norm": 8.961297035217285, + "learning_rate": 2.9843256907281017e-06, + "loss": 2.5729, + "step": 9074500 + }, + { + "epoch": 2.821095890836801, + "grad_norm": 9.295413970947266, + "learning_rate": 2.9817351527199875e-06, + "loss": 2.5844, + "step": 9075000 + }, + { + "epoch": 2.8212513231172878, + "grad_norm": 12.808403968811035, + "learning_rate": 2.9791446147118726e-06, + "loss": 2.5628, + "step": 9075500 + }, + { + "epoch": 2.8214067553977746, + "grad_norm": 12.395478248596191, + "learning_rate": 2.976554076703758e-06, + "loss": 2.5313, + "step": 9076000 + }, + { + "epoch": 2.8215621876782615, + "grad_norm": 21.941556930541992, + "learning_rate": 2.9739635386956435e-06, + "loss": 2.5423, + "step": 9076500 + }, + { + "epoch": 2.8217176199587484, + "grad_norm": 9.952011108398438, + "learning_rate": 2.971373000687529e-06, + "loss": 2.5009, + "step": 9077000 + }, + { + "epoch": 2.8218730522392352, + "grad_norm": 10.110517501831055, + "learning_rate": 2.9687824626794144e-06, + "loss": 2.5836, + "step": 9077500 + }, + { + "epoch": 2.822028484519722, + "grad_norm": 9.969439506530762, + "learning_rate": 2.9661919246713e-06, + "loss": 2.5828, + "step": 9078000 + }, + { + "epoch": 2.822183916800209, + "grad_norm": 8.873332977294922, + "learning_rate": 2.963601386663185e-06, + "loss": 2.5816, + "step": 9078500 + }, + { + "epoch": 2.822339349080696, + "grad_norm": 13.012557029724121, + "learning_rate": 2.961010848655071e-06, + "loss": 2.5865, + "step": 9079000 + }, + { + "epoch": 2.8224947813611827, + "grad_norm": 9.805108070373535, + "learning_rate": 2.958420310646956e-06, + "loss": 2.5477, + "step": 9079500 + }, + { + "epoch": 2.8226502136416696, + "grad_norm": 10.05923843383789, + "learning_rate": 2.9558297726388413e-06, + "loss": 2.5467, + "step": 9080000 + }, + { + "epoch": 2.8228056459221564, + "grad_norm": 16.456308364868164, + "learning_rate": 2.9532392346307267e-06, + "loss": 2.576, + "step": 9080500 + }, + { + "epoch": 2.8229610782026433, + "grad_norm": 11.064332008361816, + "learning_rate": 2.9506486966226118e-06, + "loss": 2.5898, + "step": 9081000 + }, + { + "epoch": 2.82311651048313, + "grad_norm": 10.308122634887695, + "learning_rate": 2.9480581586144977e-06, + "loss": 2.5713, + "step": 9081500 + }, + { + "epoch": 2.823271942763617, + "grad_norm": 10.365907669067383, + "learning_rate": 2.945467620606383e-06, + "loss": 2.5634, + "step": 9082000 + }, + { + "epoch": 2.823427375044104, + "grad_norm": 9.577983856201172, + "learning_rate": 2.942877082598268e-06, + "loss": 2.5444, + "step": 9082500 + }, + { + "epoch": 2.8235828073245908, + "grad_norm": 11.750701904296875, + "learning_rate": 2.9402865445901536e-06, + "loss": 2.5805, + "step": 9083000 + }, + { + "epoch": 2.8237382396050776, + "grad_norm": 12.873212814331055, + "learning_rate": 2.937696006582039e-06, + "loss": 2.5743, + "step": 9083500 + }, + { + "epoch": 2.8238936718855645, + "grad_norm": 47.50463104248047, + "learning_rate": 2.9351054685739245e-06, + "loss": 2.6484, + "step": 9084000 + }, + { + "epoch": 2.8240491041660514, + "grad_norm": 10.000661849975586, + "learning_rate": 2.93251493056581e-06, + "loss": 2.6138, + "step": 9084500 + }, + { + "epoch": 2.8242045364465382, + "grad_norm": 11.582756042480469, + "learning_rate": 2.929924392557695e-06, + "loss": 2.5302, + "step": 9085000 + }, + { + "epoch": 2.824359968727025, + "grad_norm": 9.947474479675293, + "learning_rate": 2.9273338545495805e-06, + "loss": 2.5563, + "step": 9085500 + }, + { + "epoch": 2.824515401007512, + "grad_norm": 11.061647415161133, + "learning_rate": 2.9247433165414664e-06, + "loss": 2.5676, + "step": 9086000 + }, + { + "epoch": 2.824670833287999, + "grad_norm": 9.169231414794922, + "learning_rate": 2.9221527785333514e-06, + "loss": 2.5622, + "step": 9086500 + }, + { + "epoch": 2.8248262655684857, + "grad_norm": 7.304764270782471, + "learning_rate": 2.919562240525237e-06, + "loss": 2.527, + "step": 9087000 + }, + { + "epoch": 2.8249816978489726, + "grad_norm": 9.394876480102539, + "learning_rate": 2.9169717025171223e-06, + "loss": 2.5523, + "step": 9087500 + }, + { + "epoch": 2.8251371301294594, + "grad_norm": 10.83205509185791, + "learning_rate": 2.914381164509008e-06, + "loss": 2.5529, + "step": 9088000 + }, + { + "epoch": 2.8252925624099463, + "grad_norm": 10.675625801086426, + "learning_rate": 2.9117906265008932e-06, + "loss": 2.5469, + "step": 9088500 + }, + { + "epoch": 2.825447994690433, + "grad_norm": 9.1421537399292, + "learning_rate": 2.9092000884927787e-06, + "loss": 2.5581, + "step": 9089000 + }, + { + "epoch": 2.82560342697092, + "grad_norm": 8.41319465637207, + "learning_rate": 2.9066095504846637e-06, + "loss": 2.5601, + "step": 9089500 + }, + { + "epoch": 2.825758859251407, + "grad_norm": 11.36243724822998, + "learning_rate": 2.904019012476549e-06, + "loss": 2.5757, + "step": 9090000 + }, + { + "epoch": 2.8259142915318938, + "grad_norm": 9.152433395385742, + "learning_rate": 2.9014284744684347e-06, + "loss": 2.5496, + "step": 9090500 + }, + { + "epoch": 2.8260697238123806, + "grad_norm": 12.688722610473633, + "learning_rate": 2.89883793646032e-06, + "loss": 2.578, + "step": 9091000 + }, + { + "epoch": 2.8262251560928675, + "grad_norm": 11.10831069946289, + "learning_rate": 2.8962473984522056e-06, + "loss": 2.5836, + "step": 9091500 + }, + { + "epoch": 2.826380588373355, + "grad_norm": 41.39500045776367, + "learning_rate": 2.8936568604440906e-06, + "loss": 2.562, + "step": 9092000 + }, + { + "epoch": 2.8265360206538412, + "grad_norm": 15.504651069641113, + "learning_rate": 2.8910663224359765e-06, + "loss": 2.5723, + "step": 9092500 + }, + { + "epoch": 2.8266914529343286, + "grad_norm": 13.570881843566895, + "learning_rate": 2.888475784427862e-06, + "loss": 2.5493, + "step": 9093000 + }, + { + "epoch": 2.826846885214815, + "grad_norm": 11.432106018066406, + "learning_rate": 2.885885246419747e-06, + "loss": 2.546, + "step": 9093500 + }, + { + "epoch": 2.8270023174953023, + "grad_norm": 9.953902244567871, + "learning_rate": 2.8832947084116325e-06, + "loss": 2.627, + "step": 9094000 + }, + { + "epoch": 2.8271577497757887, + "grad_norm": 10.073979377746582, + "learning_rate": 2.880704170403518e-06, + "loss": 2.6133, + "step": 9094500 + }, + { + "epoch": 2.827313182056276, + "grad_norm": 17.210874557495117, + "learning_rate": 2.8781136323954034e-06, + "loss": 2.5388, + "step": 9095000 + }, + { + "epoch": 2.8274686143367624, + "grad_norm": 10.514869689941406, + "learning_rate": 2.875523094387289e-06, + "loss": 2.5696, + "step": 9095500 + }, + { + "epoch": 2.8276240466172498, + "grad_norm": 10.712965965270996, + "learning_rate": 2.872932556379174e-06, + "loss": 2.5092, + "step": 9096000 + }, + { + "epoch": 2.827779478897736, + "grad_norm": 8.210408210754395, + "learning_rate": 2.8703420183710593e-06, + "loss": 2.5754, + "step": 9096500 + }, + { + "epoch": 2.8279349111782235, + "grad_norm": 10.418560028076172, + "learning_rate": 2.867751480362945e-06, + "loss": 2.537, + "step": 9097000 + }, + { + "epoch": 2.8280903434587104, + "grad_norm": 9.424196243286133, + "learning_rate": 2.8651609423548302e-06, + "loss": 2.5756, + "step": 9097500 + }, + { + "epoch": 2.828245775739197, + "grad_norm": 8.949015617370605, + "learning_rate": 2.8625704043467157e-06, + "loss": 2.5355, + "step": 9098000 + }, + { + "epoch": 2.828401208019684, + "grad_norm": 8.931544303894043, + "learning_rate": 2.859979866338601e-06, + "loss": 2.5987, + "step": 9098500 + }, + { + "epoch": 2.828556640300171, + "grad_norm": 12.17733097076416, + "learning_rate": 2.857389328330486e-06, + "loss": 2.5621, + "step": 9099000 + }, + { + "epoch": 2.828712072580658, + "grad_norm": 9.048295021057129, + "learning_rate": 2.854798790322372e-06, + "loss": 2.5515, + "step": 9099500 + }, + { + "epoch": 2.8288675048611447, + "grad_norm": 12.256664276123047, + "learning_rate": 2.852208252314257e-06, + "loss": 2.5427, + "step": 9100000 + }, + { + "epoch": 2.8290229371416316, + "grad_norm": 8.847677230834961, + "learning_rate": 2.8496177143061426e-06, + "loss": 2.5696, + "step": 9100500 + }, + { + "epoch": 2.8291783694221184, + "grad_norm": 10.955085754394531, + "learning_rate": 2.847027176298028e-06, + "loss": 2.5518, + "step": 9101000 + }, + { + "epoch": 2.8293338017026053, + "grad_norm": 8.49440860748291, + "learning_rate": 2.8444366382899135e-06, + "loss": 2.5643, + "step": 9101500 + }, + { + "epoch": 2.829489233983092, + "grad_norm": 8.80242919921875, + "learning_rate": 2.841846100281799e-06, + "loss": 2.5735, + "step": 9102000 + }, + { + "epoch": 2.829644666263579, + "grad_norm": 12.11811637878418, + "learning_rate": 2.8392555622736844e-06, + "loss": 2.6061, + "step": 9102500 + }, + { + "epoch": 2.829800098544066, + "grad_norm": 14.16651725769043, + "learning_rate": 2.8366650242655695e-06, + "loss": 2.5512, + "step": 9103000 + }, + { + "epoch": 2.8299555308245528, + "grad_norm": 16.693363189697266, + "learning_rate": 2.834074486257455e-06, + "loss": 2.5625, + "step": 9103500 + }, + { + "epoch": 2.8301109631050396, + "grad_norm": 9.611909866333008, + "learning_rate": 2.8314839482493404e-06, + "loss": 2.5654, + "step": 9104000 + }, + { + "epoch": 2.8302663953855265, + "grad_norm": 21.81202507019043, + "learning_rate": 2.828893410241226e-06, + "loss": 2.5794, + "step": 9104500 + }, + { + "epoch": 2.8304218276660134, + "grad_norm": 10.357547760009766, + "learning_rate": 2.8263028722331113e-06, + "loss": 2.5153, + "step": 9105000 + }, + { + "epoch": 2.8305772599465002, + "grad_norm": 11.311639785766602, + "learning_rate": 2.8237123342249963e-06, + "loss": 2.5634, + "step": 9105500 + }, + { + "epoch": 2.830732692226987, + "grad_norm": 10.183760643005371, + "learning_rate": 2.821121796216882e-06, + "loss": 2.5959, + "step": 9106000 + }, + { + "epoch": 2.830888124507474, + "grad_norm": 8.852886199951172, + "learning_rate": 2.8185312582087677e-06, + "loss": 2.5575, + "step": 9106500 + }, + { + "epoch": 2.831043556787961, + "grad_norm": 11.619453430175781, + "learning_rate": 2.8159407202006527e-06, + "loss": 2.5487, + "step": 9107000 + }, + { + "epoch": 2.8311989890684477, + "grad_norm": 14.64809513092041, + "learning_rate": 2.813350182192538e-06, + "loss": 2.5758, + "step": 9107500 + }, + { + "epoch": 2.8313544213489346, + "grad_norm": 9.593027114868164, + "learning_rate": 2.8107596441844236e-06, + "loss": 2.6008, + "step": 9108000 + }, + { + "epoch": 2.8315098536294214, + "grad_norm": 11.674287796020508, + "learning_rate": 2.808169106176309e-06, + "loss": 2.5747, + "step": 9108500 + }, + { + "epoch": 2.8316652859099083, + "grad_norm": 8.555713653564453, + "learning_rate": 2.8055785681681945e-06, + "loss": 2.4993, + "step": 9109000 + }, + { + "epoch": 2.831820718190395, + "grad_norm": 8.183697700500488, + "learning_rate": 2.8029880301600796e-06, + "loss": 2.4711, + "step": 9109500 + }, + { + "epoch": 2.831976150470882, + "grad_norm": 9.5299711227417, + "learning_rate": 2.800397492151965e-06, + "loss": 2.5483, + "step": 9110000 + }, + { + "epoch": 2.832131582751369, + "grad_norm": 10.36415958404541, + "learning_rate": 2.797806954143851e-06, + "loss": 2.5203, + "step": 9110500 + }, + { + "epoch": 2.8322870150318558, + "grad_norm": 9.534286499023438, + "learning_rate": 2.795216416135736e-06, + "loss": 2.5541, + "step": 9111000 + }, + { + "epoch": 2.8324424473123426, + "grad_norm": 9.950815200805664, + "learning_rate": 2.7926258781276214e-06, + "loss": 2.5096, + "step": 9111500 + }, + { + "epoch": 2.8325978795928295, + "grad_norm": 10.72596549987793, + "learning_rate": 2.790035340119507e-06, + "loss": 2.5852, + "step": 9112000 + }, + { + "epoch": 2.8327533118733164, + "grad_norm": 14.946500778198242, + "learning_rate": 2.7874448021113923e-06, + "loss": 2.5477, + "step": 9112500 + }, + { + "epoch": 2.8329087441538032, + "grad_norm": 11.817760467529297, + "learning_rate": 2.784854264103278e-06, + "loss": 2.5342, + "step": 9113000 + }, + { + "epoch": 2.83306417643429, + "grad_norm": 10.306166648864746, + "learning_rate": 2.7822637260951632e-06, + "loss": 2.5791, + "step": 9113500 + }, + { + "epoch": 2.833219608714777, + "grad_norm": 9.85966682434082, + "learning_rate": 2.7796731880870483e-06, + "loss": 2.5214, + "step": 9114000 + }, + { + "epoch": 2.833375040995264, + "grad_norm": 7.395770072937012, + "learning_rate": 2.7770826500789337e-06, + "loss": 2.5723, + "step": 9114500 + }, + { + "epoch": 2.8335304732757507, + "grad_norm": 14.033004760742188, + "learning_rate": 2.774492112070819e-06, + "loss": 2.531, + "step": 9115000 + }, + { + "epoch": 2.833685905556238, + "grad_norm": 13.573179244995117, + "learning_rate": 2.7719015740627047e-06, + "loss": 2.5758, + "step": 9115500 + }, + { + "epoch": 2.8338413378367244, + "grad_norm": 8.876496315002441, + "learning_rate": 2.76931103605459e-06, + "loss": 2.5705, + "step": 9116000 + }, + { + "epoch": 2.8339967701172117, + "grad_norm": 10.502772331237793, + "learning_rate": 2.766720498046475e-06, + "loss": 2.5337, + "step": 9116500 + }, + { + "epoch": 2.834152202397698, + "grad_norm": 9.383430480957031, + "learning_rate": 2.764129960038361e-06, + "loss": 2.5769, + "step": 9117000 + }, + { + "epoch": 2.8343076346781855, + "grad_norm": 12.497210502624512, + "learning_rate": 2.7615394220302465e-06, + "loss": 2.5169, + "step": 9117500 + }, + { + "epoch": 2.834463066958672, + "grad_norm": 9.200204849243164, + "learning_rate": 2.7589488840221315e-06, + "loss": 2.5945, + "step": 9118000 + }, + { + "epoch": 2.834618499239159, + "grad_norm": 12.103777885437012, + "learning_rate": 2.756358346014017e-06, + "loss": 2.575, + "step": 9118500 + }, + { + "epoch": 2.8347739315196456, + "grad_norm": 9.846861839294434, + "learning_rate": 2.7537678080059025e-06, + "loss": 2.5573, + "step": 9119000 + }, + { + "epoch": 2.834929363800133, + "grad_norm": 9.65036392211914, + "learning_rate": 2.751177269997788e-06, + "loss": 2.5622, + "step": 9119500 + }, + { + "epoch": 2.8350847960806194, + "grad_norm": 14.483942031860352, + "learning_rate": 2.7485867319896734e-06, + "loss": 2.5846, + "step": 9120000 + }, + { + "epoch": 2.8352402283611067, + "grad_norm": 12.19099235534668, + "learning_rate": 2.7459961939815584e-06, + "loss": 2.5357, + "step": 9120500 + }, + { + "epoch": 2.835395660641593, + "grad_norm": 9.832926750183105, + "learning_rate": 2.743405655973444e-06, + "loss": 2.5188, + "step": 9121000 + }, + { + "epoch": 2.8355510929220804, + "grad_norm": 8.812397003173828, + "learning_rate": 2.7408151179653298e-06, + "loss": 2.5832, + "step": 9121500 + }, + { + "epoch": 2.8357065252025673, + "grad_norm": 31.303905487060547, + "learning_rate": 2.7382245799572148e-06, + "loss": 2.5546, + "step": 9122000 + }, + { + "epoch": 2.835861957483054, + "grad_norm": 15.090863227844238, + "learning_rate": 2.7356340419491002e-06, + "loss": 2.5583, + "step": 9122500 + }, + { + "epoch": 2.836017389763541, + "grad_norm": 12.709002494812012, + "learning_rate": 2.7330435039409857e-06, + "loss": 2.5553, + "step": 9123000 + }, + { + "epoch": 2.836172822044028, + "grad_norm": 9.78985595703125, + "learning_rate": 2.7304529659328707e-06, + "loss": 2.6167, + "step": 9123500 + }, + { + "epoch": 2.8363282543245147, + "grad_norm": 14.21963882446289, + "learning_rate": 2.7278624279247566e-06, + "loss": 2.5526, + "step": 9124000 + }, + { + "epoch": 2.8364836866050016, + "grad_norm": 13.94539737701416, + "learning_rate": 2.7252718899166417e-06, + "loss": 2.5973, + "step": 9124500 + }, + { + "epoch": 2.8366391188854885, + "grad_norm": 30.483531951904297, + "learning_rate": 2.722681351908527e-06, + "loss": 2.5451, + "step": 9125000 + }, + { + "epoch": 2.8367945511659753, + "grad_norm": 11.220691680908203, + "learning_rate": 2.7200908139004126e-06, + "loss": 2.5563, + "step": 9125500 + }, + { + "epoch": 2.836949983446462, + "grad_norm": 9.418087005615234, + "learning_rate": 2.717500275892298e-06, + "loss": 2.5506, + "step": 9126000 + }, + { + "epoch": 2.837105415726949, + "grad_norm": 11.465091705322266, + "learning_rate": 2.7149097378841835e-06, + "loss": 2.5819, + "step": 9126500 + }, + { + "epoch": 2.837260848007436, + "grad_norm": 9.064719200134277, + "learning_rate": 2.712319199876069e-06, + "loss": 2.6162, + "step": 9127000 + }, + { + "epoch": 2.837416280287923, + "grad_norm": 8.507620811462402, + "learning_rate": 2.709728661867954e-06, + "loss": 2.5423, + "step": 9127500 + }, + { + "epoch": 2.8375717125684097, + "grad_norm": 10.531933784484863, + "learning_rate": 2.7071381238598395e-06, + "loss": 2.5981, + "step": 9128000 + }, + { + "epoch": 2.8377271448488965, + "grad_norm": 10.68731689453125, + "learning_rate": 2.704547585851725e-06, + "loss": 2.6053, + "step": 9128500 + }, + { + "epoch": 2.8378825771293834, + "grad_norm": 9.960143089294434, + "learning_rate": 2.7019570478436104e-06, + "loss": 2.5602, + "step": 9129000 + }, + { + "epoch": 2.8380380094098703, + "grad_norm": 10.765227317810059, + "learning_rate": 2.699366509835496e-06, + "loss": 2.5389, + "step": 9129500 + }, + { + "epoch": 2.838193441690357, + "grad_norm": 9.243003845214844, + "learning_rate": 2.696775971827381e-06, + "loss": 2.5429, + "step": 9130000 + }, + { + "epoch": 2.838348873970844, + "grad_norm": 10.6455717086792, + "learning_rate": 2.6941854338192667e-06, + "loss": 2.5868, + "step": 9130500 + }, + { + "epoch": 2.838504306251331, + "grad_norm": 8.442527770996094, + "learning_rate": 2.691594895811152e-06, + "loss": 2.5413, + "step": 9131000 + }, + { + "epoch": 2.8386597385318177, + "grad_norm": 43.354976654052734, + "learning_rate": 2.6890043578030372e-06, + "loss": 2.5645, + "step": 9131500 + }, + { + "epoch": 2.8388151708123046, + "grad_norm": 10.044071197509766, + "learning_rate": 2.6864138197949227e-06, + "loss": 2.6294, + "step": 9132000 + }, + { + "epoch": 2.8389706030927915, + "grad_norm": 8.654329299926758, + "learning_rate": 2.683823281786808e-06, + "loss": 2.5503, + "step": 9132500 + }, + { + "epoch": 2.8391260353732783, + "grad_norm": 12.90803050994873, + "learning_rate": 2.6812327437786936e-06, + "loss": 2.5567, + "step": 9133000 + }, + { + "epoch": 2.839281467653765, + "grad_norm": 28.20157814025879, + "learning_rate": 2.678642205770579e-06, + "loss": 2.5798, + "step": 9133500 + }, + { + "epoch": 2.839436899934252, + "grad_norm": 9.106420516967773, + "learning_rate": 2.676051667762464e-06, + "loss": 2.5558, + "step": 9134000 + }, + { + "epoch": 2.839592332214739, + "grad_norm": 38.17180633544922, + "learning_rate": 2.6734611297543496e-06, + "loss": 2.519, + "step": 9134500 + }, + { + "epoch": 2.839747764495226, + "grad_norm": 12.664758682250977, + "learning_rate": 2.6708705917462355e-06, + "loss": 2.5717, + "step": 9135000 + }, + { + "epoch": 2.8399031967757127, + "grad_norm": 9.90505599975586, + "learning_rate": 2.6682800537381205e-06, + "loss": 2.5639, + "step": 9135500 + }, + { + "epoch": 2.8400586290561995, + "grad_norm": 14.46729850769043, + "learning_rate": 2.665689515730006e-06, + "loss": 2.5171, + "step": 9136000 + }, + { + "epoch": 2.8402140613366864, + "grad_norm": 16.730751037597656, + "learning_rate": 2.6630989777218914e-06, + "loss": 2.5122, + "step": 9136500 + }, + { + "epoch": 2.8403694936171733, + "grad_norm": 42.33957290649414, + "learning_rate": 2.6605084397137764e-06, + "loss": 2.5606, + "step": 9137000 + }, + { + "epoch": 2.84052492589766, + "grad_norm": 9.279006004333496, + "learning_rate": 2.6579179017056623e-06, + "loss": 2.5885, + "step": 9137500 + }, + { + "epoch": 2.840680358178147, + "grad_norm": 14.368605613708496, + "learning_rate": 2.655327363697548e-06, + "loss": 2.5385, + "step": 9138000 + }, + { + "epoch": 2.840835790458634, + "grad_norm": 10.242383003234863, + "learning_rate": 2.652736825689433e-06, + "loss": 2.5155, + "step": 9138500 + }, + { + "epoch": 2.8409912227391207, + "grad_norm": 9.511886596679688, + "learning_rate": 2.6501462876813183e-06, + "loss": 2.5654, + "step": 9139000 + }, + { + "epoch": 2.8411466550196076, + "grad_norm": 9.150810241699219, + "learning_rate": 2.6475557496732037e-06, + "loss": 2.5662, + "step": 9139500 + }, + { + "epoch": 2.841302087300095, + "grad_norm": 10.771003723144531, + "learning_rate": 2.644965211665089e-06, + "loss": 2.5504, + "step": 9140000 + }, + { + "epoch": 2.8414575195805813, + "grad_norm": 11.267475128173828, + "learning_rate": 2.6423746736569747e-06, + "loss": 2.5662, + "step": 9140500 + }, + { + "epoch": 2.8416129518610687, + "grad_norm": 9.139908790588379, + "learning_rate": 2.6397841356488597e-06, + "loss": 2.5487, + "step": 9141000 + }, + { + "epoch": 2.841768384141555, + "grad_norm": 12.273515701293945, + "learning_rate": 2.637193597640745e-06, + "loss": 2.6027, + "step": 9141500 + }, + { + "epoch": 2.8419238164220424, + "grad_norm": 11.053633689880371, + "learning_rate": 2.634603059632631e-06, + "loss": 2.5998, + "step": 9142000 + }, + { + "epoch": 2.842079248702529, + "grad_norm": 10.170711517333984, + "learning_rate": 2.632012521624516e-06, + "loss": 2.5527, + "step": 9142500 + }, + { + "epoch": 2.842234680983016, + "grad_norm": 9.890172004699707, + "learning_rate": 2.6294219836164015e-06, + "loss": 2.5405, + "step": 9143000 + }, + { + "epoch": 2.8423901132635025, + "grad_norm": 10.507576942443848, + "learning_rate": 2.626831445608287e-06, + "loss": 2.5794, + "step": 9143500 + }, + { + "epoch": 2.84254554554399, + "grad_norm": 8.941609382629395, + "learning_rate": 2.6242409076001725e-06, + "loss": 2.5864, + "step": 9144000 + }, + { + "epoch": 2.8427009778244763, + "grad_norm": 7.894316673278809, + "learning_rate": 2.621650369592058e-06, + "loss": 2.5858, + "step": 9144500 + }, + { + "epoch": 2.8428564101049636, + "grad_norm": 8.851678848266602, + "learning_rate": 2.619059831583943e-06, + "loss": 2.5827, + "step": 9145000 + }, + { + "epoch": 2.8430118423854505, + "grad_norm": 13.141622543334961, + "learning_rate": 2.6164692935758284e-06, + "loss": 2.5917, + "step": 9145500 + }, + { + "epoch": 2.8431672746659373, + "grad_norm": 12.650850296020508, + "learning_rate": 2.6138787555677143e-06, + "loss": 2.5356, + "step": 9146000 + }, + { + "epoch": 2.843322706946424, + "grad_norm": 40.215660095214844, + "learning_rate": 2.6112882175595993e-06, + "loss": 2.5789, + "step": 9146500 + }, + { + "epoch": 2.843478139226911, + "grad_norm": 10.206186294555664, + "learning_rate": 2.6086976795514848e-06, + "loss": 2.5333, + "step": 9147000 + }, + { + "epoch": 2.843633571507398, + "grad_norm": 9.692111015319824, + "learning_rate": 2.6061071415433702e-06, + "loss": 2.6157, + "step": 9147500 + }, + { + "epoch": 2.843789003787885, + "grad_norm": 11.92597770690918, + "learning_rate": 2.6035166035352553e-06, + "loss": 2.5213, + "step": 9148000 + }, + { + "epoch": 2.8439444360683717, + "grad_norm": 9.135348320007324, + "learning_rate": 2.600926065527141e-06, + "loss": 2.5516, + "step": 9148500 + }, + { + "epoch": 2.8440998683488585, + "grad_norm": 9.754507064819336, + "learning_rate": 2.598335527519026e-06, + "loss": 2.5832, + "step": 9149000 + }, + { + "epoch": 2.8442553006293454, + "grad_norm": 11.107542037963867, + "learning_rate": 2.5957449895109117e-06, + "loss": 2.5507, + "step": 9149500 + }, + { + "epoch": 2.8444107329098323, + "grad_norm": 28.433305740356445, + "learning_rate": 2.593154451502797e-06, + "loss": 2.5123, + "step": 9150000 + }, + { + "epoch": 2.844566165190319, + "grad_norm": 11.129348754882812, + "learning_rate": 2.5905639134946826e-06, + "loss": 2.5723, + "step": 9150500 + }, + { + "epoch": 2.844721597470806, + "grad_norm": 15.191725730895996, + "learning_rate": 2.587973375486568e-06, + "loss": 2.5386, + "step": 9151000 + }, + { + "epoch": 2.844877029751293, + "grad_norm": 9.265682220458984, + "learning_rate": 2.5853828374784535e-06, + "loss": 2.5422, + "step": 9151500 + }, + { + "epoch": 2.8450324620317797, + "grad_norm": 15.339303016662598, + "learning_rate": 2.5827922994703385e-06, + "loss": 2.5588, + "step": 9152000 + }, + { + "epoch": 2.8451878943122666, + "grad_norm": 10.617773056030273, + "learning_rate": 2.580201761462224e-06, + "loss": 2.5772, + "step": 9152500 + }, + { + "epoch": 2.8453433265927535, + "grad_norm": 12.080373764038086, + "learning_rate": 2.5776112234541095e-06, + "loss": 2.5456, + "step": 9153000 + }, + { + "epoch": 2.8454987588732403, + "grad_norm": 11.105497360229492, + "learning_rate": 2.575020685445995e-06, + "loss": 2.5897, + "step": 9153500 + }, + { + "epoch": 2.845654191153727, + "grad_norm": 12.07433795928955, + "learning_rate": 2.5724301474378804e-06, + "loss": 2.5886, + "step": 9154000 + }, + { + "epoch": 2.845809623434214, + "grad_norm": 13.287981033325195, + "learning_rate": 2.5698396094297654e-06, + "loss": 2.5473, + "step": 9154500 + }, + { + "epoch": 2.845965055714701, + "grad_norm": 12.60881233215332, + "learning_rate": 2.5672490714216513e-06, + "loss": 2.5907, + "step": 9155000 + }, + { + "epoch": 2.846120487995188, + "grad_norm": 10.578124046325684, + "learning_rate": 2.5646585334135367e-06, + "loss": 2.5498, + "step": 9155500 + }, + { + "epoch": 2.8462759202756747, + "grad_norm": 31.22245979309082, + "learning_rate": 2.5620679954054218e-06, + "loss": 2.5723, + "step": 9156000 + }, + { + "epoch": 2.8464313525561615, + "grad_norm": 9.857291221618652, + "learning_rate": 2.5594774573973072e-06, + "loss": 2.532, + "step": 9156500 + }, + { + "epoch": 2.8465867848366484, + "grad_norm": 11.708579063415527, + "learning_rate": 2.5568869193891927e-06, + "loss": 2.5766, + "step": 9157000 + }, + { + "epoch": 2.8467422171171353, + "grad_norm": 11.37524700164795, + "learning_rate": 2.554296381381078e-06, + "loss": 2.6034, + "step": 9157500 + }, + { + "epoch": 2.846897649397622, + "grad_norm": 8.261534690856934, + "learning_rate": 2.5517058433729636e-06, + "loss": 2.5278, + "step": 9158000 + }, + { + "epoch": 2.847053081678109, + "grad_norm": 10.44766616821289, + "learning_rate": 2.549115305364849e-06, + "loss": 2.5822, + "step": 9158500 + }, + { + "epoch": 2.847208513958596, + "grad_norm": 10.344866752624512, + "learning_rate": 2.546524767356734e-06, + "loss": 2.5374, + "step": 9159000 + }, + { + "epoch": 2.8473639462390827, + "grad_norm": 9.889067649841309, + "learning_rate": 2.54393422934862e-06, + "loss": 2.5755, + "step": 9159500 + }, + { + "epoch": 2.8475193785195696, + "grad_norm": 15.25700569152832, + "learning_rate": 2.541343691340505e-06, + "loss": 2.5674, + "step": 9160000 + }, + { + "epoch": 2.8476748108000565, + "grad_norm": 12.049497604370117, + "learning_rate": 2.5387531533323905e-06, + "loss": 2.581, + "step": 9160500 + }, + { + "epoch": 2.8478302430805433, + "grad_norm": 9.350799560546875, + "learning_rate": 2.536162615324276e-06, + "loss": 2.5346, + "step": 9161000 + }, + { + "epoch": 2.84798567536103, + "grad_norm": 9.21042537689209, + "learning_rate": 2.533572077316161e-06, + "loss": 2.5516, + "step": 9161500 + }, + { + "epoch": 2.848141107641517, + "grad_norm": 12.438009262084961, + "learning_rate": 2.530981539308047e-06, + "loss": 2.5478, + "step": 9162000 + }, + { + "epoch": 2.848296539922004, + "grad_norm": 10.14573860168457, + "learning_rate": 2.5283910012999323e-06, + "loss": 2.5795, + "step": 9162500 + }, + { + "epoch": 2.848451972202491, + "grad_norm": 13.251081466674805, + "learning_rate": 2.5258004632918174e-06, + "loss": 2.5467, + "step": 9163000 + }, + { + "epoch": 2.8486074044829777, + "grad_norm": 9.2969388961792, + "learning_rate": 2.523209925283703e-06, + "loss": 2.5175, + "step": 9163500 + }, + { + "epoch": 2.8487628367634645, + "grad_norm": 9.780869483947754, + "learning_rate": 2.5206193872755883e-06, + "loss": 2.576, + "step": 9164000 + }, + { + "epoch": 2.848918269043952, + "grad_norm": 7.783201217651367, + "learning_rate": 2.5180288492674737e-06, + "loss": 2.5509, + "step": 9164500 + }, + { + "epoch": 2.8490737013244383, + "grad_norm": 10.084342956542969, + "learning_rate": 2.515438311259359e-06, + "loss": 2.5371, + "step": 9165000 + }, + { + "epoch": 2.8492291336049256, + "grad_norm": 9.29798698425293, + "learning_rate": 2.5128477732512442e-06, + "loss": 2.551, + "step": 9165500 + }, + { + "epoch": 2.849384565885412, + "grad_norm": 9.979815483093262, + "learning_rate": 2.5102572352431297e-06, + "loss": 2.5438, + "step": 9166000 + }, + { + "epoch": 2.8495399981658993, + "grad_norm": 20.827714920043945, + "learning_rate": 2.5076666972350156e-06, + "loss": 2.5585, + "step": 9166500 + }, + { + "epoch": 2.8496954304463857, + "grad_norm": 8.914770126342773, + "learning_rate": 2.5050761592269006e-06, + "loss": 2.5678, + "step": 9167000 + }, + { + "epoch": 2.849850862726873, + "grad_norm": 10.8153715133667, + "learning_rate": 2.502485621218786e-06, + "loss": 2.567, + "step": 9167500 + }, + { + "epoch": 2.8500062950073595, + "grad_norm": 12.198121070861816, + "learning_rate": 2.4998950832106715e-06, + "loss": 2.5697, + "step": 9168000 + }, + { + "epoch": 2.850161727287847, + "grad_norm": 8.236361503601074, + "learning_rate": 2.497304545202557e-06, + "loss": 2.5611, + "step": 9168500 + }, + { + "epoch": 2.850317159568333, + "grad_norm": 13.1848726272583, + "learning_rate": 2.4947140071944425e-06, + "loss": 2.5531, + "step": 9169000 + }, + { + "epoch": 2.8504725918488205, + "grad_norm": 10.67041015625, + "learning_rate": 2.4921234691863275e-06, + "loss": 2.5473, + "step": 9169500 + }, + { + "epoch": 2.8506280241293074, + "grad_norm": 21.484994888305664, + "learning_rate": 2.489532931178213e-06, + "loss": 2.5691, + "step": 9170000 + }, + { + "epoch": 2.8507834564097942, + "grad_norm": 9.529518127441406, + "learning_rate": 2.4869423931700984e-06, + "loss": 2.5438, + "step": 9170500 + }, + { + "epoch": 2.850938888690281, + "grad_norm": 10.115939140319824, + "learning_rate": 2.484351855161984e-06, + "loss": 2.5008, + "step": 9171000 + }, + { + "epoch": 2.851094320970768, + "grad_norm": 11.442014694213867, + "learning_rate": 2.4817613171538693e-06, + "loss": 2.5659, + "step": 9171500 + }, + { + "epoch": 2.851249753251255, + "grad_norm": 15.940937995910645, + "learning_rate": 2.4791707791457548e-06, + "loss": 2.575, + "step": 9172000 + }, + { + "epoch": 2.8514051855317417, + "grad_norm": 18.828166961669922, + "learning_rate": 2.47658024113764e-06, + "loss": 2.5662, + "step": 9172500 + }, + { + "epoch": 2.8515606178122286, + "grad_norm": 8.179937362670898, + "learning_rate": 2.4739897031295257e-06, + "loss": 2.5703, + "step": 9173000 + }, + { + "epoch": 2.8517160500927154, + "grad_norm": 9.999612808227539, + "learning_rate": 2.4713991651214107e-06, + "loss": 2.6088, + "step": 9173500 + }, + { + "epoch": 2.8518714823732023, + "grad_norm": 12.03852367401123, + "learning_rate": 2.468808627113296e-06, + "loss": 2.5431, + "step": 9174000 + }, + { + "epoch": 2.852026914653689, + "grad_norm": 9.286359786987305, + "learning_rate": 2.4662180891051817e-06, + "loss": 2.5376, + "step": 9174500 + }, + { + "epoch": 2.852182346934176, + "grad_norm": 8.684741020202637, + "learning_rate": 2.4636275510970667e-06, + "loss": 2.5116, + "step": 9175000 + }, + { + "epoch": 2.852337779214663, + "grad_norm": 15.34786319732666, + "learning_rate": 2.4610370130889526e-06, + "loss": 2.5575, + "step": 9175500 + }, + { + "epoch": 2.85249321149515, + "grad_norm": 10.442316055297852, + "learning_rate": 2.458446475080838e-06, + "loss": 2.5633, + "step": 9176000 + }, + { + "epoch": 2.8526486437756366, + "grad_norm": 11.251178741455078, + "learning_rate": 2.455855937072723e-06, + "loss": 2.5977, + "step": 9176500 + }, + { + "epoch": 2.8528040760561235, + "grad_norm": 16.304210662841797, + "learning_rate": 2.4532653990646085e-06, + "loss": 2.5557, + "step": 9177000 + }, + { + "epoch": 2.8529595083366104, + "grad_norm": 10.646988868713379, + "learning_rate": 2.450674861056494e-06, + "loss": 2.5285, + "step": 9177500 + }, + { + "epoch": 2.8531149406170973, + "grad_norm": 13.814742088317871, + "learning_rate": 2.4480843230483795e-06, + "loss": 2.5772, + "step": 9178000 + }, + { + "epoch": 2.853270372897584, + "grad_norm": 9.864331245422363, + "learning_rate": 2.445493785040265e-06, + "loss": 2.5541, + "step": 9178500 + }, + { + "epoch": 2.853425805178071, + "grad_norm": 11.553482055664062, + "learning_rate": 2.44290324703215e-06, + "loss": 2.4977, + "step": 9179000 + }, + { + "epoch": 2.853581237458558, + "grad_norm": 10.76351261138916, + "learning_rate": 2.440312709024036e-06, + "loss": 2.5869, + "step": 9179500 + }, + { + "epoch": 2.8537366697390447, + "grad_norm": 17.368247985839844, + "learning_rate": 2.4377221710159213e-06, + "loss": 2.5578, + "step": 9180000 + }, + { + "epoch": 2.8538921020195316, + "grad_norm": 11.916595458984375, + "learning_rate": 2.4351316330078063e-06, + "loss": 2.5526, + "step": 9180500 + }, + { + "epoch": 2.8540475343000185, + "grad_norm": 8.559293746948242, + "learning_rate": 2.4325410949996918e-06, + "loss": 2.582, + "step": 9181000 + }, + { + "epoch": 2.8542029665805053, + "grad_norm": 15.419880867004395, + "learning_rate": 2.4299505569915772e-06, + "loss": 2.5485, + "step": 9181500 + }, + { + "epoch": 2.854358398860992, + "grad_norm": 15.974517822265625, + "learning_rate": 2.4273600189834627e-06, + "loss": 2.5702, + "step": 9182000 + }, + { + "epoch": 2.854513831141479, + "grad_norm": 9.768399238586426, + "learning_rate": 2.424769480975348e-06, + "loss": 2.5484, + "step": 9182500 + }, + { + "epoch": 2.854669263421966, + "grad_norm": 7.994653701782227, + "learning_rate": 2.4221789429672336e-06, + "loss": 2.5196, + "step": 9183000 + }, + { + "epoch": 2.854824695702453, + "grad_norm": 10.089919090270996, + "learning_rate": 2.4195884049591187e-06, + "loss": 2.5567, + "step": 9183500 + }, + { + "epoch": 2.8549801279829397, + "grad_norm": 9.240706443786621, + "learning_rate": 2.4169978669510045e-06, + "loss": 2.5137, + "step": 9184000 + }, + { + "epoch": 2.8551355602634265, + "grad_norm": 8.34070110321045, + "learning_rate": 2.4144073289428896e-06, + "loss": 2.6184, + "step": 9184500 + }, + { + "epoch": 2.8552909925439134, + "grad_norm": 9.755051612854004, + "learning_rate": 2.411816790934775e-06, + "loss": 2.5756, + "step": 9185000 + }, + { + "epoch": 2.8554464248244003, + "grad_norm": 16.976171493530273, + "learning_rate": 2.4092262529266605e-06, + "loss": 2.5724, + "step": 9185500 + }, + { + "epoch": 2.855601857104887, + "grad_norm": 10.737174987792969, + "learning_rate": 2.4066357149185455e-06, + "loss": 2.5674, + "step": 9186000 + }, + { + "epoch": 2.855757289385374, + "grad_norm": 11.509466171264648, + "learning_rate": 2.4040451769104314e-06, + "loss": 2.5641, + "step": 9186500 + }, + { + "epoch": 2.855912721665861, + "grad_norm": 12.264230728149414, + "learning_rate": 2.401454638902317e-06, + "loss": 2.5367, + "step": 9187000 + }, + { + "epoch": 2.8560681539463477, + "grad_norm": 11.219582557678223, + "learning_rate": 2.398864100894202e-06, + "loss": 2.5603, + "step": 9187500 + }, + { + "epoch": 2.856223586226835, + "grad_norm": 12.680868148803711, + "learning_rate": 2.3962735628860874e-06, + "loss": 2.5227, + "step": 9188000 + }, + { + "epoch": 2.8563790185073215, + "grad_norm": 7.198174953460693, + "learning_rate": 2.393683024877973e-06, + "loss": 2.5638, + "step": 9188500 + }, + { + "epoch": 2.8565344507878088, + "grad_norm": 10.40397834777832, + "learning_rate": 2.3910924868698583e-06, + "loss": 2.5572, + "step": 9189000 + }, + { + "epoch": 2.856689883068295, + "grad_norm": 10.229960441589355, + "learning_rate": 2.3885019488617437e-06, + "loss": 2.5575, + "step": 9189500 + }, + { + "epoch": 2.8568453153487825, + "grad_norm": 11.089405059814453, + "learning_rate": 2.3859114108536288e-06, + "loss": 2.5711, + "step": 9190000 + }, + { + "epoch": 2.857000747629269, + "grad_norm": 10.197100639343262, + "learning_rate": 2.3833208728455142e-06, + "loss": 2.5499, + "step": 9190500 + }, + { + "epoch": 2.8571561799097562, + "grad_norm": 17.446866989135742, + "learning_rate": 2.3807303348374e-06, + "loss": 2.545, + "step": 9191000 + }, + { + "epoch": 2.8573116121902427, + "grad_norm": 18.4987735748291, + "learning_rate": 2.378139796829285e-06, + "loss": 2.5111, + "step": 9191500 + }, + { + "epoch": 2.85746704447073, + "grad_norm": 11.752022743225098, + "learning_rate": 2.3755492588211706e-06, + "loss": 2.5438, + "step": 9192000 + }, + { + "epoch": 2.8576224767512164, + "grad_norm": 10.041058540344238, + "learning_rate": 2.372958720813056e-06, + "loss": 2.5672, + "step": 9192500 + }, + { + "epoch": 2.8577779090317037, + "grad_norm": 11.258054733276367, + "learning_rate": 2.3703681828049415e-06, + "loss": 2.5773, + "step": 9193000 + }, + { + "epoch": 2.85793334131219, + "grad_norm": 16.015727996826172, + "learning_rate": 2.367777644796827e-06, + "loss": 2.5407, + "step": 9193500 + }, + { + "epoch": 2.8580887735926774, + "grad_norm": 6.494688510894775, + "learning_rate": 2.365187106788712e-06, + "loss": 2.5698, + "step": 9194000 + }, + { + "epoch": 2.8582442058731643, + "grad_norm": 6.3348164558410645, + "learning_rate": 2.3625965687805975e-06, + "loss": 2.5542, + "step": 9194500 + }, + { + "epoch": 2.858399638153651, + "grad_norm": 11.677184104919434, + "learning_rate": 2.360006030772483e-06, + "loss": 2.5456, + "step": 9195000 + }, + { + "epoch": 2.858555070434138, + "grad_norm": 11.449917793273926, + "learning_rate": 2.3574154927643684e-06, + "loss": 2.5273, + "step": 9195500 + }, + { + "epoch": 2.858710502714625, + "grad_norm": 10.511784553527832, + "learning_rate": 2.354824954756254e-06, + "loss": 2.5676, + "step": 9196000 + }, + { + "epoch": 2.8588659349951118, + "grad_norm": 10.227781295776367, + "learning_rate": 2.3522344167481393e-06, + "loss": 2.5413, + "step": 9196500 + }, + { + "epoch": 2.8590213672755986, + "grad_norm": 9.609048843383789, + "learning_rate": 2.3496438787400244e-06, + "loss": 2.6021, + "step": 9197000 + }, + { + "epoch": 2.8591767995560855, + "grad_norm": 10.332019805908203, + "learning_rate": 2.3470533407319102e-06, + "loss": 2.5375, + "step": 9197500 + }, + { + "epoch": 2.8593322318365724, + "grad_norm": 10.990618705749512, + "learning_rate": 2.3444628027237953e-06, + "loss": 2.562, + "step": 9198000 + }, + { + "epoch": 2.8594876641170592, + "grad_norm": 10.900579452514648, + "learning_rate": 2.3418722647156807e-06, + "loss": 2.5915, + "step": 9198500 + }, + { + "epoch": 2.859643096397546, + "grad_norm": 11.122231483459473, + "learning_rate": 2.339281726707566e-06, + "loss": 2.5643, + "step": 9199000 + }, + { + "epoch": 2.859798528678033, + "grad_norm": 8.30380916595459, + "learning_rate": 2.3366911886994512e-06, + "loss": 2.5496, + "step": 9199500 + }, + { + "epoch": 2.85995396095852, + "grad_norm": 8.8403902053833, + "learning_rate": 2.334100650691337e-06, + "loss": 2.6078, + "step": 9200000 + }, + { + "epoch": 2.8601093932390067, + "grad_norm": 11.97409725189209, + "learning_rate": 2.3315101126832226e-06, + "loss": 2.6076, + "step": 9200500 + }, + { + "epoch": 2.8602648255194936, + "grad_norm": 9.451781272888184, + "learning_rate": 2.3289195746751076e-06, + "loss": 2.5204, + "step": 9201000 + }, + { + "epoch": 2.8604202577999804, + "grad_norm": 11.115578651428223, + "learning_rate": 2.326329036666993e-06, + "loss": 2.5716, + "step": 9201500 + }, + { + "epoch": 2.8605756900804673, + "grad_norm": 8.852158546447754, + "learning_rate": 2.3237384986588785e-06, + "loss": 2.5407, + "step": 9202000 + }, + { + "epoch": 2.860731122360954, + "grad_norm": 47.48777770996094, + "learning_rate": 2.321147960650764e-06, + "loss": 2.5245, + "step": 9202500 + }, + { + "epoch": 2.860886554641441, + "grad_norm": 13.507616996765137, + "learning_rate": 2.3185574226426495e-06, + "loss": 2.5487, + "step": 9203000 + }, + { + "epoch": 2.861041986921928, + "grad_norm": 20.825279235839844, + "learning_rate": 2.3159668846345345e-06, + "loss": 2.5695, + "step": 9203500 + }, + { + "epoch": 2.8611974192024148, + "grad_norm": 5.974109649658203, + "learning_rate": 2.31337634662642e-06, + "loss": 2.5328, + "step": 9204000 + }, + { + "epoch": 2.8613528514829016, + "grad_norm": 10.93310546875, + "learning_rate": 2.310785808618306e-06, + "loss": 2.5845, + "step": 9204500 + }, + { + "epoch": 2.8615082837633885, + "grad_norm": 8.39892578125, + "learning_rate": 2.308195270610191e-06, + "loss": 2.527, + "step": 9205000 + }, + { + "epoch": 2.8616637160438754, + "grad_norm": 9.037941932678223, + "learning_rate": 2.3056047326020763e-06, + "loss": 2.5718, + "step": 9205500 + }, + { + "epoch": 2.8618191483243622, + "grad_norm": 10.419159889221191, + "learning_rate": 2.3030141945939618e-06, + "loss": 2.5521, + "step": 9206000 + }, + { + "epoch": 2.861974580604849, + "grad_norm": 9.828124046325684, + "learning_rate": 2.3004236565858472e-06, + "loss": 2.5009, + "step": 9206500 + }, + { + "epoch": 2.862130012885336, + "grad_norm": 8.99098014831543, + "learning_rate": 2.2978331185777327e-06, + "loss": 2.5018, + "step": 9207000 + }, + { + "epoch": 2.862285445165823, + "grad_norm": 10.585124969482422, + "learning_rate": 2.295242580569618e-06, + "loss": 2.5544, + "step": 9207500 + }, + { + "epoch": 2.8624408774463097, + "grad_norm": 9.952974319458008, + "learning_rate": 2.292652042561503e-06, + "loss": 2.4903, + "step": 9208000 + }, + { + "epoch": 2.8625963097267966, + "grad_norm": 9.195089340209961, + "learning_rate": 2.2900615045533887e-06, + "loss": 2.5541, + "step": 9208500 + }, + { + "epoch": 2.8627517420072834, + "grad_norm": 10.171996116638184, + "learning_rate": 2.287470966545274e-06, + "loss": 2.4902, + "step": 9209000 + }, + { + "epoch": 2.8629071742877703, + "grad_norm": 11.279616355895996, + "learning_rate": 2.2848804285371596e-06, + "loss": 2.5882, + "step": 9209500 + }, + { + "epoch": 2.863062606568257, + "grad_norm": 10.670083999633789, + "learning_rate": 2.282289890529045e-06, + "loss": 2.5322, + "step": 9210000 + }, + { + "epoch": 2.863218038848744, + "grad_norm": 10.154560089111328, + "learning_rate": 2.27969935252093e-06, + "loss": 2.6071, + "step": 9210500 + }, + { + "epoch": 2.863373471129231, + "grad_norm": 10.233804702758789, + "learning_rate": 2.277108814512816e-06, + "loss": 2.5472, + "step": 9211000 + }, + { + "epoch": 2.8635289034097178, + "grad_norm": 10.243959426879883, + "learning_rate": 2.2745182765047014e-06, + "loss": 2.5287, + "step": 9211500 + }, + { + "epoch": 2.8636843356902046, + "grad_norm": 11.699769973754883, + "learning_rate": 2.2719277384965864e-06, + "loss": 2.556, + "step": 9212000 + }, + { + "epoch": 2.863839767970692, + "grad_norm": 17.22662353515625, + "learning_rate": 2.269337200488472e-06, + "loss": 2.5397, + "step": 9212500 + }, + { + "epoch": 2.8639952002511784, + "grad_norm": 9.645674705505371, + "learning_rate": 2.2667466624803574e-06, + "loss": 2.5372, + "step": 9213000 + }, + { + "epoch": 2.8641506325316657, + "grad_norm": 9.611234664916992, + "learning_rate": 2.264156124472243e-06, + "loss": 2.5223, + "step": 9213500 + }, + { + "epoch": 2.864306064812152, + "grad_norm": 11.610940933227539, + "learning_rate": 2.2615655864641283e-06, + "loss": 2.5481, + "step": 9214000 + }, + { + "epoch": 2.8644614970926394, + "grad_norm": 10.118257522583008, + "learning_rate": 2.2589750484560133e-06, + "loss": 2.5059, + "step": 9214500 + }, + { + "epoch": 2.864616929373126, + "grad_norm": 34.13334274291992, + "learning_rate": 2.2563845104478988e-06, + "loss": 2.5234, + "step": 9215000 + }, + { + "epoch": 2.864772361653613, + "grad_norm": 8.157570838928223, + "learning_rate": 2.2537939724397847e-06, + "loss": 2.5696, + "step": 9215500 + }, + { + "epoch": 2.8649277939340996, + "grad_norm": 9.323088645935059, + "learning_rate": 2.2512034344316697e-06, + "loss": 2.5327, + "step": 9216000 + }, + { + "epoch": 2.865083226214587, + "grad_norm": 10.502217292785645, + "learning_rate": 2.248612896423555e-06, + "loss": 2.5509, + "step": 9216500 + }, + { + "epoch": 2.8652386584950733, + "grad_norm": 9.331470489501953, + "learning_rate": 2.2460223584154406e-06, + "loss": 2.5357, + "step": 9217000 + }, + { + "epoch": 2.8653940907755606, + "grad_norm": 11.988970756530762, + "learning_rate": 2.243431820407326e-06, + "loss": 2.5676, + "step": 9217500 + }, + { + "epoch": 2.8655495230560475, + "grad_norm": 12.771717071533203, + "learning_rate": 2.2408412823992115e-06, + "loss": 2.5848, + "step": 9218000 + }, + { + "epoch": 2.8657049553365344, + "grad_norm": 9.921781539916992, + "learning_rate": 2.2382507443910966e-06, + "loss": 2.5759, + "step": 9218500 + }, + { + "epoch": 2.865860387617021, + "grad_norm": 10.086572647094727, + "learning_rate": 2.235660206382982e-06, + "loss": 2.5767, + "step": 9219000 + }, + { + "epoch": 2.866015819897508, + "grad_norm": 8.74293327331543, + "learning_rate": 2.2330696683748675e-06, + "loss": 2.5396, + "step": 9219500 + }, + { + "epoch": 2.866171252177995, + "grad_norm": 9.47861099243164, + "learning_rate": 2.230479130366753e-06, + "loss": 2.5642, + "step": 9220000 + }, + { + "epoch": 2.866326684458482, + "grad_norm": 9.596108436584473, + "learning_rate": 2.2278885923586384e-06, + "loss": 2.5372, + "step": 9220500 + }, + { + "epoch": 2.8664821167389687, + "grad_norm": 6.232234477996826, + "learning_rate": 2.225298054350524e-06, + "loss": 2.5661, + "step": 9221000 + }, + { + "epoch": 2.8666375490194556, + "grad_norm": 9.518465995788574, + "learning_rate": 2.222707516342409e-06, + "loss": 2.6014, + "step": 9221500 + }, + { + "epoch": 2.8667929812999424, + "grad_norm": 11.840365409851074, + "learning_rate": 2.2201169783342948e-06, + "loss": 2.5647, + "step": 9222000 + }, + { + "epoch": 2.8669484135804293, + "grad_norm": 14.035515785217285, + "learning_rate": 2.21752644032618e-06, + "loss": 2.5855, + "step": 9222500 + }, + { + "epoch": 2.867103845860916, + "grad_norm": 12.11801815032959, + "learning_rate": 2.2149359023180653e-06, + "loss": 2.5578, + "step": 9223000 + }, + { + "epoch": 2.867259278141403, + "grad_norm": 11.916824340820312, + "learning_rate": 2.2123453643099507e-06, + "loss": 2.5552, + "step": 9223500 + }, + { + "epoch": 2.86741471042189, + "grad_norm": 11.601402282714844, + "learning_rate": 2.2097548263018358e-06, + "loss": 2.5622, + "step": 9224000 + }, + { + "epoch": 2.8675701427023768, + "grad_norm": 9.940378189086914, + "learning_rate": 2.2071642882937217e-06, + "loss": 2.5947, + "step": 9224500 + }, + { + "epoch": 2.8677255749828636, + "grad_norm": 8.06032943725586, + "learning_rate": 2.204573750285607e-06, + "loss": 2.5043, + "step": 9225000 + }, + { + "epoch": 2.8678810072633505, + "grad_norm": 11.151515007019043, + "learning_rate": 2.201983212277492e-06, + "loss": 2.5253, + "step": 9225500 + }, + { + "epoch": 2.8680364395438374, + "grad_norm": 9.080103874206543, + "learning_rate": 2.1993926742693776e-06, + "loss": 2.5478, + "step": 9226000 + }, + { + "epoch": 2.8681918718243242, + "grad_norm": 8.787680625915527, + "learning_rate": 2.196802136261263e-06, + "loss": 2.5742, + "step": 9226500 + }, + { + "epoch": 2.868347304104811, + "grad_norm": 15.028587341308594, + "learning_rate": 2.1942115982531485e-06, + "loss": 2.5818, + "step": 9227000 + }, + { + "epoch": 2.868502736385298, + "grad_norm": 9.897826194763184, + "learning_rate": 2.191621060245034e-06, + "loss": 2.5775, + "step": 9227500 + }, + { + "epoch": 2.868658168665785, + "grad_norm": 9.328222274780273, + "learning_rate": 2.189030522236919e-06, + "loss": 2.546, + "step": 9228000 + }, + { + "epoch": 2.8688136009462717, + "grad_norm": 9.621133804321289, + "learning_rate": 2.1864399842288045e-06, + "loss": 2.591, + "step": 9228500 + }, + { + "epoch": 2.8689690332267586, + "grad_norm": 9.053793907165527, + "learning_rate": 2.1838494462206904e-06, + "loss": 2.5542, + "step": 9229000 + }, + { + "epoch": 2.8691244655072454, + "grad_norm": 9.028112411499023, + "learning_rate": 2.1812589082125754e-06, + "loss": 2.5857, + "step": 9229500 + }, + { + "epoch": 2.8692798977877323, + "grad_norm": 10.09740161895752, + "learning_rate": 2.178668370204461e-06, + "loss": 2.5807, + "step": 9230000 + }, + { + "epoch": 2.869435330068219, + "grad_norm": 9.435855865478516, + "learning_rate": 2.1760778321963463e-06, + "loss": 2.5879, + "step": 9230500 + }, + { + "epoch": 2.869590762348706, + "grad_norm": 8.940279960632324, + "learning_rate": 2.1734872941882318e-06, + "loss": 2.4904, + "step": 9231000 + }, + { + "epoch": 2.869746194629193, + "grad_norm": 10.200088500976562, + "learning_rate": 2.1708967561801172e-06, + "loss": 2.5385, + "step": 9231500 + }, + { + "epoch": 2.8699016269096798, + "grad_norm": 13.319586753845215, + "learning_rate": 2.1683062181720027e-06, + "loss": 2.5693, + "step": 9232000 + }, + { + "epoch": 2.8700570591901666, + "grad_norm": 11.183283805847168, + "learning_rate": 2.1657156801638877e-06, + "loss": 2.583, + "step": 9232500 + }, + { + "epoch": 2.8702124914706535, + "grad_norm": 10.230856895446777, + "learning_rate": 2.163125142155773e-06, + "loss": 2.5523, + "step": 9233000 + }, + { + "epoch": 2.8703679237511404, + "grad_norm": 10.740245819091797, + "learning_rate": 2.1605346041476587e-06, + "loss": 2.5601, + "step": 9233500 + }, + { + "epoch": 2.8705233560316272, + "grad_norm": 11.08126163482666, + "learning_rate": 2.157944066139544e-06, + "loss": 2.5943, + "step": 9234000 + }, + { + "epoch": 2.870678788312114, + "grad_norm": 12.26414680480957, + "learning_rate": 2.1553535281314296e-06, + "loss": 2.5079, + "step": 9234500 + }, + { + "epoch": 2.870834220592601, + "grad_norm": 12.418335914611816, + "learning_rate": 2.1527629901233146e-06, + "loss": 2.5583, + "step": 9235000 + }, + { + "epoch": 2.870989652873088, + "grad_norm": 11.869806289672852, + "learning_rate": 2.1501724521152005e-06, + "loss": 2.5432, + "step": 9235500 + }, + { + "epoch": 2.871145085153575, + "grad_norm": 8.265896797180176, + "learning_rate": 2.147581914107086e-06, + "loss": 2.5869, + "step": 9236000 + }, + { + "epoch": 2.8713005174340616, + "grad_norm": 6.976011276245117, + "learning_rate": 2.144991376098971e-06, + "loss": 2.5226, + "step": 9236500 + }, + { + "epoch": 2.871455949714549, + "grad_norm": 10.983071327209473, + "learning_rate": 2.1424008380908564e-06, + "loss": 2.5412, + "step": 9237000 + }, + { + "epoch": 2.8716113819950353, + "grad_norm": 12.95893669128418, + "learning_rate": 2.139810300082742e-06, + "loss": 2.5347, + "step": 9237500 + }, + { + "epoch": 2.8717668142755226, + "grad_norm": 7.354109287261963, + "learning_rate": 2.1372197620746274e-06, + "loss": 2.6, + "step": 9238000 + }, + { + "epoch": 2.871922246556009, + "grad_norm": 39.932586669921875, + "learning_rate": 2.134629224066513e-06, + "loss": 2.5603, + "step": 9238500 + }, + { + "epoch": 2.8720776788364963, + "grad_norm": 9.11042308807373, + "learning_rate": 2.132038686058398e-06, + "loss": 2.5276, + "step": 9239000 + }, + { + "epoch": 2.8722331111169828, + "grad_norm": 10.93744945526123, + "learning_rate": 2.1294481480502833e-06, + "loss": 2.5866, + "step": 9239500 + }, + { + "epoch": 2.87238854339747, + "grad_norm": 10.496177673339844, + "learning_rate": 2.126857610042169e-06, + "loss": 2.5679, + "step": 9240000 + }, + { + "epoch": 2.8725439756779565, + "grad_norm": 10.207171440124512, + "learning_rate": 2.1242670720340542e-06, + "loss": 2.5909, + "step": 9240500 + }, + { + "epoch": 2.872699407958444, + "grad_norm": 13.942031860351562, + "learning_rate": 2.1216765340259397e-06, + "loss": 2.5842, + "step": 9241000 + }, + { + "epoch": 2.8728548402389302, + "grad_norm": 8.666959762573242, + "learning_rate": 2.119085996017825e-06, + "loss": 2.5879, + "step": 9241500 + }, + { + "epoch": 2.8730102725194175, + "grad_norm": 10.008528709411621, + "learning_rate": 2.11649545800971e-06, + "loss": 2.5307, + "step": 9242000 + }, + { + "epoch": 2.8731657047999044, + "grad_norm": 10.815345764160156, + "learning_rate": 2.113904920001596e-06, + "loss": 2.5608, + "step": 9242500 + }, + { + "epoch": 2.8733211370803913, + "grad_norm": 10.199846267700195, + "learning_rate": 2.111314381993481e-06, + "loss": 2.5518, + "step": 9243000 + }, + { + "epoch": 2.873476569360878, + "grad_norm": 9.920417785644531, + "learning_rate": 2.1087238439853666e-06, + "loss": 2.5488, + "step": 9243500 + }, + { + "epoch": 2.873632001641365, + "grad_norm": 12.579082489013672, + "learning_rate": 2.106133305977252e-06, + "loss": 2.5415, + "step": 9244000 + }, + { + "epoch": 2.873787433921852, + "grad_norm": 13.554028511047363, + "learning_rate": 2.1035427679691375e-06, + "loss": 2.5489, + "step": 9244500 + }, + { + "epoch": 2.8739428662023387, + "grad_norm": 12.16521167755127, + "learning_rate": 2.100952229961023e-06, + "loss": 2.5451, + "step": 9245000 + }, + { + "epoch": 2.8740982984828256, + "grad_norm": 13.262414932250977, + "learning_rate": 2.0983616919529084e-06, + "loss": 2.567, + "step": 9245500 + }, + { + "epoch": 2.8742537307633125, + "grad_norm": 12.75589656829834, + "learning_rate": 2.0957711539447934e-06, + "loss": 2.5418, + "step": 9246000 + }, + { + "epoch": 2.8744091630437993, + "grad_norm": 12.123448371887207, + "learning_rate": 2.0931806159366793e-06, + "loss": 2.5572, + "step": 9246500 + }, + { + "epoch": 2.874564595324286, + "grad_norm": 10.143647193908691, + "learning_rate": 2.0905900779285644e-06, + "loss": 2.5368, + "step": 9247000 + }, + { + "epoch": 2.874720027604773, + "grad_norm": 9.986997604370117, + "learning_rate": 2.08799953992045e-06, + "loss": 2.6097, + "step": 9247500 + }, + { + "epoch": 2.87487545988526, + "grad_norm": 9.561484336853027, + "learning_rate": 2.0854090019123353e-06, + "loss": 2.5791, + "step": 9248000 + }, + { + "epoch": 2.875030892165747, + "grad_norm": 11.54172134399414, + "learning_rate": 2.0828184639042203e-06, + "loss": 2.5689, + "step": 9248500 + }, + { + "epoch": 2.8751863244462337, + "grad_norm": 9.135892868041992, + "learning_rate": 2.080227925896106e-06, + "loss": 2.5888, + "step": 9249000 + }, + { + "epoch": 2.8753417567267205, + "grad_norm": 9.052690505981445, + "learning_rate": 2.0776373878879917e-06, + "loss": 2.5445, + "step": 9249500 + }, + { + "epoch": 2.8754971890072074, + "grad_norm": 8.067643165588379, + "learning_rate": 2.0750468498798767e-06, + "loss": 2.5739, + "step": 9250000 + }, + { + "epoch": 2.8756526212876943, + "grad_norm": 9.161531448364258, + "learning_rate": 2.072456311871762e-06, + "loss": 2.5258, + "step": 9250500 + }, + { + "epoch": 2.875808053568181, + "grad_norm": 8.093664169311523, + "learning_rate": 2.0698657738636476e-06, + "loss": 2.5737, + "step": 9251000 + }, + { + "epoch": 2.875963485848668, + "grad_norm": 11.113117218017578, + "learning_rate": 2.067275235855533e-06, + "loss": 2.5786, + "step": 9251500 + }, + { + "epoch": 2.876118918129155, + "grad_norm": 9.710968017578125, + "learning_rate": 2.0646846978474185e-06, + "loss": 2.5725, + "step": 9252000 + }, + { + "epoch": 2.8762743504096417, + "grad_norm": 10.87992000579834, + "learning_rate": 2.0620941598393036e-06, + "loss": 2.5432, + "step": 9252500 + }, + { + "epoch": 2.8764297826901286, + "grad_norm": 10.460853576660156, + "learning_rate": 2.059503621831189e-06, + "loss": 2.5329, + "step": 9253000 + }, + { + "epoch": 2.8765852149706155, + "grad_norm": 18.534034729003906, + "learning_rate": 2.056913083823075e-06, + "loss": 2.5371, + "step": 9253500 + }, + { + "epoch": 2.8767406472511023, + "grad_norm": 10.561006546020508, + "learning_rate": 2.05432254581496e-06, + "loss": 2.5665, + "step": 9254000 + }, + { + "epoch": 2.876896079531589, + "grad_norm": 21.983779907226562, + "learning_rate": 2.0517320078068454e-06, + "loss": 2.5551, + "step": 9254500 + }, + { + "epoch": 2.877051511812076, + "grad_norm": 11.654129028320312, + "learning_rate": 2.049141469798731e-06, + "loss": 2.5157, + "step": 9255000 + }, + { + "epoch": 2.877206944092563, + "grad_norm": 9.102530479431152, + "learning_rate": 2.0465509317906163e-06, + "loss": 2.5515, + "step": 9255500 + }, + { + "epoch": 2.87736237637305, + "grad_norm": 9.537543296813965, + "learning_rate": 2.0439603937825018e-06, + "loss": 2.5296, + "step": 9256000 + }, + { + "epoch": 2.8775178086535367, + "grad_norm": 8.547029495239258, + "learning_rate": 2.0413698557743872e-06, + "loss": 2.5335, + "step": 9256500 + }, + { + "epoch": 2.8776732409340235, + "grad_norm": 9.013689994812012, + "learning_rate": 2.0387793177662723e-06, + "loss": 2.523, + "step": 9257000 + }, + { + "epoch": 2.8778286732145104, + "grad_norm": 9.218417167663574, + "learning_rate": 2.0361887797581577e-06, + "loss": 2.5752, + "step": 9257500 + }, + { + "epoch": 2.8779841054949973, + "grad_norm": 10.67524242401123, + "learning_rate": 2.033598241750043e-06, + "loss": 2.553, + "step": 9258000 + }, + { + "epoch": 2.878139537775484, + "grad_norm": 9.885542869567871, + "learning_rate": 2.0310077037419287e-06, + "loss": 2.5387, + "step": 9258500 + }, + { + "epoch": 2.878294970055971, + "grad_norm": 9.539007186889648, + "learning_rate": 2.028417165733814e-06, + "loss": 2.5498, + "step": 9259000 + }, + { + "epoch": 2.878450402336458, + "grad_norm": 10.2242431640625, + "learning_rate": 2.025826627725699e-06, + "loss": 2.5781, + "step": 9259500 + }, + { + "epoch": 2.8786058346169447, + "grad_norm": 19.11222267150879, + "learning_rate": 2.023236089717585e-06, + "loss": 2.5717, + "step": 9260000 + }, + { + "epoch": 2.878761266897432, + "grad_norm": 8.616496086120605, + "learning_rate": 2.0206455517094705e-06, + "loss": 2.5634, + "step": 9260500 + }, + { + "epoch": 2.8789166991779185, + "grad_norm": 11.65404224395752, + "learning_rate": 2.0180550137013555e-06, + "loss": 2.5952, + "step": 9261000 + }, + { + "epoch": 2.879072131458406, + "grad_norm": 12.244885444641113, + "learning_rate": 2.015464475693241e-06, + "loss": 2.5151, + "step": 9261500 + }, + { + "epoch": 2.879227563738892, + "grad_norm": 14.731833457946777, + "learning_rate": 2.0128739376851264e-06, + "loss": 2.5231, + "step": 9262000 + }, + { + "epoch": 2.8793829960193795, + "grad_norm": 6.491483688354492, + "learning_rate": 2.010283399677012e-06, + "loss": 2.5214, + "step": 9262500 + }, + { + "epoch": 2.879538428299866, + "grad_norm": 9.513086318969727, + "learning_rate": 2.0076928616688974e-06, + "loss": 2.5836, + "step": 9263000 + }, + { + "epoch": 2.8796938605803533, + "grad_norm": 11.503996849060059, + "learning_rate": 2.0051023236607824e-06, + "loss": 2.5314, + "step": 9263500 + }, + { + "epoch": 2.8798492928608397, + "grad_norm": 10.111282348632812, + "learning_rate": 2.002511785652668e-06, + "loss": 2.5553, + "step": 9264000 + }, + { + "epoch": 2.880004725141327, + "grad_norm": 9.449530601501465, + "learning_rate": 1.9999212476445537e-06, + "loss": 2.5714, + "step": 9264500 + }, + { + "epoch": 2.8801601574218134, + "grad_norm": 7.428249835968018, + "learning_rate": 1.9973307096364388e-06, + "loss": 2.531, + "step": 9265000 + }, + { + "epoch": 2.8803155897023007, + "grad_norm": 16.849943161010742, + "learning_rate": 1.9947401716283242e-06, + "loss": 2.5512, + "step": 9265500 + }, + { + "epoch": 2.8804710219827876, + "grad_norm": 7.566002368927002, + "learning_rate": 1.9921496336202097e-06, + "loss": 2.5127, + "step": 9266000 + }, + { + "epoch": 2.8806264542632745, + "grad_norm": 8.944548606872559, + "learning_rate": 1.9895590956120947e-06, + "loss": 2.4943, + "step": 9266500 + }, + { + "epoch": 2.8807818865437613, + "grad_norm": 11.525050163269043, + "learning_rate": 1.9869685576039806e-06, + "loss": 2.5065, + "step": 9267000 + }, + { + "epoch": 2.880937318824248, + "grad_norm": 16.412410736083984, + "learning_rate": 1.9843780195958657e-06, + "loss": 2.5317, + "step": 9267500 + }, + { + "epoch": 2.881092751104735, + "grad_norm": 12.761841773986816, + "learning_rate": 1.981787481587751e-06, + "loss": 2.5125, + "step": 9268000 + }, + { + "epoch": 2.881248183385222, + "grad_norm": 11.558201789855957, + "learning_rate": 1.9791969435796366e-06, + "loss": 2.5589, + "step": 9268500 + }, + { + "epoch": 2.881403615665709, + "grad_norm": 9.054408073425293, + "learning_rate": 1.976606405571522e-06, + "loss": 2.5719, + "step": 9269000 + }, + { + "epoch": 2.8815590479461957, + "grad_norm": 8.50940990447998, + "learning_rate": 1.9740158675634075e-06, + "loss": 2.5131, + "step": 9269500 + }, + { + "epoch": 2.8817144802266825, + "grad_norm": 10.611186981201172, + "learning_rate": 1.971425329555293e-06, + "loss": 2.4756, + "step": 9270000 + }, + { + "epoch": 2.8818699125071694, + "grad_norm": 12.758356094360352, + "learning_rate": 1.968834791547178e-06, + "loss": 2.5601, + "step": 9270500 + }, + { + "epoch": 2.8820253447876563, + "grad_norm": 11.669154167175293, + "learning_rate": 1.9662442535390634e-06, + "loss": 2.496, + "step": 9271000 + }, + { + "epoch": 2.882180777068143, + "grad_norm": 12.70785903930664, + "learning_rate": 1.963653715530949e-06, + "loss": 2.5943, + "step": 9271500 + }, + { + "epoch": 2.88233620934863, + "grad_norm": 8.9775390625, + "learning_rate": 1.9610631775228344e-06, + "loss": 2.5258, + "step": 9272000 + }, + { + "epoch": 2.882491641629117, + "grad_norm": 11.029213905334473, + "learning_rate": 1.95847263951472e-06, + "loss": 2.5618, + "step": 9272500 + }, + { + "epoch": 2.8826470739096037, + "grad_norm": 10.154243469238281, + "learning_rate": 1.955882101506605e-06, + "loss": 2.545, + "step": 9273000 + }, + { + "epoch": 2.8828025061900906, + "grad_norm": 10.3358154296875, + "learning_rate": 1.9532915634984907e-06, + "loss": 2.575, + "step": 9273500 + }, + { + "epoch": 2.8829579384705775, + "grad_norm": 16.932872772216797, + "learning_rate": 1.950701025490376e-06, + "loss": 2.5121, + "step": 9274000 + }, + { + "epoch": 2.8831133707510643, + "grad_norm": 9.409358024597168, + "learning_rate": 1.9481104874822612e-06, + "loss": 2.5783, + "step": 9274500 + }, + { + "epoch": 2.883268803031551, + "grad_norm": 13.882746696472168, + "learning_rate": 1.9455199494741467e-06, + "loss": 2.5479, + "step": 9275000 + }, + { + "epoch": 2.883424235312038, + "grad_norm": 9.075703620910645, + "learning_rate": 1.942929411466032e-06, + "loss": 2.5921, + "step": 9275500 + }, + { + "epoch": 2.883579667592525, + "grad_norm": 9.396488189697266, + "learning_rate": 1.9403388734579176e-06, + "loss": 2.568, + "step": 9276000 + }, + { + "epoch": 2.883735099873012, + "grad_norm": 9.0136137008667, + "learning_rate": 1.937748335449803e-06, + "loss": 2.5903, + "step": 9276500 + }, + { + "epoch": 2.8838905321534987, + "grad_norm": 11.040788650512695, + "learning_rate": 1.935157797441688e-06, + "loss": 2.5529, + "step": 9277000 + }, + { + "epoch": 2.8840459644339855, + "grad_norm": 8.179075241088867, + "learning_rate": 1.9325672594335736e-06, + "loss": 2.5198, + "step": 9277500 + }, + { + "epoch": 2.8842013967144724, + "grad_norm": 11.278197288513184, + "learning_rate": 1.9299767214254595e-06, + "loss": 2.5519, + "step": 9278000 + }, + { + "epoch": 2.8843568289949593, + "grad_norm": 8.946014404296875, + "learning_rate": 1.9273861834173445e-06, + "loss": 2.5482, + "step": 9278500 + }, + { + "epoch": 2.884512261275446, + "grad_norm": 9.341863632202148, + "learning_rate": 1.92479564540923e-06, + "loss": 2.5609, + "step": 9279000 + }, + { + "epoch": 2.884667693555933, + "grad_norm": 8.964146614074707, + "learning_rate": 1.9222051074011154e-06, + "loss": 2.5403, + "step": 9279500 + }, + { + "epoch": 2.88482312583642, + "grad_norm": 22.718673706054688, + "learning_rate": 1.919614569393001e-06, + "loss": 2.5762, + "step": 9280000 + }, + { + "epoch": 2.8849785581169067, + "grad_norm": 8.792386054992676, + "learning_rate": 1.9170240313848863e-06, + "loss": 2.5345, + "step": 9280500 + }, + { + "epoch": 2.8851339903973936, + "grad_norm": 11.229283332824707, + "learning_rate": 1.9144334933767718e-06, + "loss": 2.5771, + "step": 9281000 + }, + { + "epoch": 2.8852894226778805, + "grad_norm": 9.075122833251953, + "learning_rate": 1.911842955368657e-06, + "loss": 2.6031, + "step": 9281500 + }, + { + "epoch": 2.8854448549583673, + "grad_norm": 9.507908821105957, + "learning_rate": 1.9092524173605423e-06, + "loss": 2.5719, + "step": 9282000 + }, + { + "epoch": 2.885600287238854, + "grad_norm": 15.350672721862793, + "learning_rate": 1.906661879352428e-06, + "loss": 2.4912, + "step": 9282500 + }, + { + "epoch": 2.885755719519341, + "grad_norm": 11.064205169677734, + "learning_rate": 1.9040713413443132e-06, + "loss": 2.545, + "step": 9283000 + }, + { + "epoch": 2.885911151799828, + "grad_norm": 8.880244255065918, + "learning_rate": 1.9014808033361984e-06, + "loss": 2.5526, + "step": 9283500 + }, + { + "epoch": 2.8860665840803152, + "grad_norm": 12.616994857788086, + "learning_rate": 1.898890265328084e-06, + "loss": 2.558, + "step": 9284000 + }, + { + "epoch": 2.8862220163608017, + "grad_norm": 12.57839298248291, + "learning_rate": 1.8962997273199696e-06, + "loss": 2.5752, + "step": 9284500 + }, + { + "epoch": 2.886377448641289, + "grad_norm": 10.727341651916504, + "learning_rate": 1.8937091893118548e-06, + "loss": 2.5741, + "step": 9285000 + }, + { + "epoch": 2.8865328809217754, + "grad_norm": 11.693695068359375, + "learning_rate": 1.89111865130374e-06, + "loss": 2.5588, + "step": 9285500 + }, + { + "epoch": 2.8866883132022627, + "grad_norm": 8.691823959350586, + "learning_rate": 1.8885281132956255e-06, + "loss": 2.5844, + "step": 9286000 + }, + { + "epoch": 2.886843745482749, + "grad_norm": 8.14132022857666, + "learning_rate": 1.8859375752875108e-06, + "loss": 2.522, + "step": 9286500 + }, + { + "epoch": 2.8869991777632364, + "grad_norm": 8.96260929107666, + "learning_rate": 1.8833470372793964e-06, + "loss": 2.5593, + "step": 9287000 + }, + { + "epoch": 2.887154610043723, + "grad_norm": 7.08823299407959, + "learning_rate": 1.8807564992712817e-06, + "loss": 2.5628, + "step": 9287500 + }, + { + "epoch": 2.88731004232421, + "grad_norm": 9.78604507446289, + "learning_rate": 1.8781659612631672e-06, + "loss": 2.5276, + "step": 9288000 + }, + { + "epoch": 2.8874654746046966, + "grad_norm": 9.764583587646484, + "learning_rate": 1.8755754232550524e-06, + "loss": 2.561, + "step": 9288500 + }, + { + "epoch": 2.887620906885184, + "grad_norm": 11.478694915771484, + "learning_rate": 1.872984885246938e-06, + "loss": 2.5767, + "step": 9289000 + }, + { + "epoch": 2.8877763391656703, + "grad_norm": 8.878772735595703, + "learning_rate": 1.8703943472388233e-06, + "loss": 2.5638, + "step": 9289500 + }, + { + "epoch": 2.8879317714461576, + "grad_norm": 11.025714874267578, + "learning_rate": 1.8678038092307088e-06, + "loss": 2.5588, + "step": 9290000 + }, + { + "epoch": 2.8880872037266445, + "grad_norm": 10.136838912963867, + "learning_rate": 1.865213271222594e-06, + "loss": 2.5921, + "step": 9290500 + }, + { + "epoch": 2.8882426360071314, + "grad_norm": 9.034015655517578, + "learning_rate": 1.8626227332144795e-06, + "loss": 2.5836, + "step": 9291000 + }, + { + "epoch": 2.8883980682876182, + "grad_norm": 17.16301918029785, + "learning_rate": 1.8600321952063652e-06, + "loss": 2.5936, + "step": 9291500 + }, + { + "epoch": 2.888553500568105, + "grad_norm": 12.387414932250977, + "learning_rate": 1.8574416571982504e-06, + "loss": 2.5528, + "step": 9292000 + }, + { + "epoch": 2.888708932848592, + "grad_norm": 28.864797592163086, + "learning_rate": 1.8548511191901357e-06, + "loss": 2.5319, + "step": 9292500 + }, + { + "epoch": 2.888864365129079, + "grad_norm": 10.883344650268555, + "learning_rate": 1.8522605811820211e-06, + "loss": 2.551, + "step": 9293000 + }, + { + "epoch": 2.8890197974095657, + "grad_norm": 9.464073181152344, + "learning_rate": 1.8496700431739068e-06, + "loss": 2.5594, + "step": 9293500 + }, + { + "epoch": 2.8891752296900526, + "grad_norm": 10.955820083618164, + "learning_rate": 1.847079505165792e-06, + "loss": 2.5499, + "step": 9294000 + }, + { + "epoch": 2.8893306619705394, + "grad_norm": 11.198522567749023, + "learning_rate": 1.8444889671576773e-06, + "loss": 2.5323, + "step": 9294500 + }, + { + "epoch": 2.8894860942510263, + "grad_norm": 8.77892017364502, + "learning_rate": 1.8418984291495627e-06, + "loss": 2.5761, + "step": 9295000 + }, + { + "epoch": 2.889641526531513, + "grad_norm": 8.940411567687988, + "learning_rate": 1.839307891141448e-06, + "loss": 2.5224, + "step": 9295500 + }, + { + "epoch": 2.889796958812, + "grad_norm": 10.125676155090332, + "learning_rate": 1.8367173531333337e-06, + "loss": 2.5537, + "step": 9296000 + }, + { + "epoch": 2.889952391092487, + "grad_norm": 7.247826099395752, + "learning_rate": 1.834126815125219e-06, + "loss": 2.5394, + "step": 9296500 + }, + { + "epoch": 2.890107823372974, + "grad_norm": 15.234710693359375, + "learning_rate": 1.8315362771171044e-06, + "loss": 2.5554, + "step": 9297000 + }, + { + "epoch": 2.8902632556534607, + "grad_norm": 8.27919864654541, + "learning_rate": 1.8289457391089896e-06, + "loss": 2.5212, + "step": 9297500 + }, + { + "epoch": 2.8904186879339475, + "grad_norm": 10.2689208984375, + "learning_rate": 1.8263552011008753e-06, + "loss": 2.5555, + "step": 9298000 + }, + { + "epoch": 2.8905741202144344, + "grad_norm": 12.704341888427734, + "learning_rate": 1.8237646630927605e-06, + "loss": 2.5592, + "step": 9298500 + }, + { + "epoch": 2.8907295524949213, + "grad_norm": 32.21665954589844, + "learning_rate": 1.821174125084646e-06, + "loss": 2.5246, + "step": 9299000 + }, + { + "epoch": 2.890884984775408, + "grad_norm": 12.211770057678223, + "learning_rate": 1.8185835870765312e-06, + "loss": 2.5413, + "step": 9299500 + }, + { + "epoch": 2.891040417055895, + "grad_norm": 8.827033042907715, + "learning_rate": 1.8159930490684165e-06, + "loss": 2.523, + "step": 9300000 + }, + { + "epoch": 2.891195849336382, + "grad_norm": 9.815950393676758, + "learning_rate": 1.8134025110603022e-06, + "loss": 2.5812, + "step": 9300500 + }, + { + "epoch": 2.8913512816168687, + "grad_norm": 11.696491241455078, + "learning_rate": 1.8108119730521876e-06, + "loss": 2.4946, + "step": 9301000 + }, + { + "epoch": 2.8915067138973556, + "grad_norm": 10.549017906188965, + "learning_rate": 1.8082214350440729e-06, + "loss": 2.5301, + "step": 9301500 + }, + { + "epoch": 2.8916621461778425, + "grad_norm": 16.261106491088867, + "learning_rate": 1.8056308970359581e-06, + "loss": 2.5255, + "step": 9302000 + }, + { + "epoch": 2.8918175784583293, + "grad_norm": 12.339995384216309, + "learning_rate": 1.8030403590278438e-06, + "loss": 2.5574, + "step": 9302500 + }, + { + "epoch": 2.891973010738816, + "grad_norm": 19.897300720214844, + "learning_rate": 1.8004498210197292e-06, + "loss": 2.5493, + "step": 9303000 + }, + { + "epoch": 2.892128443019303, + "grad_norm": 10.6024808883667, + "learning_rate": 1.7978592830116145e-06, + "loss": 2.5682, + "step": 9303500 + }, + { + "epoch": 2.89228387529979, + "grad_norm": 9.347624778747559, + "learning_rate": 1.7952687450034997e-06, + "loss": 2.5723, + "step": 9304000 + }, + { + "epoch": 2.892439307580277, + "grad_norm": 8.91598892211914, + "learning_rate": 1.7926782069953852e-06, + "loss": 2.5415, + "step": 9304500 + }, + { + "epoch": 2.8925947398607637, + "grad_norm": 10.483896255493164, + "learning_rate": 1.7900876689872709e-06, + "loss": 2.5385, + "step": 9305000 + }, + { + "epoch": 2.8927501721412505, + "grad_norm": 11.824641227722168, + "learning_rate": 1.7874971309791561e-06, + "loss": 2.5381, + "step": 9305500 + }, + { + "epoch": 2.8929056044217374, + "grad_norm": 15.963720321655273, + "learning_rate": 1.7849065929710414e-06, + "loss": 2.5178, + "step": 9306000 + }, + { + "epoch": 2.8930610367022243, + "grad_norm": 10.476840019226074, + "learning_rate": 1.7823160549629268e-06, + "loss": 2.5744, + "step": 9306500 + }, + { + "epoch": 2.893216468982711, + "grad_norm": 10.2554292678833, + "learning_rate": 1.7797255169548125e-06, + "loss": 2.5578, + "step": 9307000 + }, + { + "epoch": 2.893371901263198, + "grad_norm": 10.15816593170166, + "learning_rate": 1.7771349789466977e-06, + "loss": 2.5066, + "step": 9307500 + }, + { + "epoch": 2.893527333543685, + "grad_norm": 11.194283485412598, + "learning_rate": 1.774544440938583e-06, + "loss": 2.5812, + "step": 9308000 + }, + { + "epoch": 2.893682765824172, + "grad_norm": 18.754703521728516, + "learning_rate": 1.7719539029304684e-06, + "loss": 2.5332, + "step": 9308500 + }, + { + "epoch": 2.8938381981046586, + "grad_norm": 8.277291297912598, + "learning_rate": 1.7693633649223537e-06, + "loss": 2.5386, + "step": 9309000 + }, + { + "epoch": 2.893993630385146, + "grad_norm": 14.207059860229492, + "learning_rate": 1.7667728269142394e-06, + "loss": 2.6187, + "step": 9309500 + }, + { + "epoch": 2.8941490626656323, + "grad_norm": 10.16219711303711, + "learning_rate": 1.7641822889061246e-06, + "loss": 2.5455, + "step": 9310000 + }, + { + "epoch": 2.8943044949461196, + "grad_norm": 11.195525169372559, + "learning_rate": 1.76159175089801e-06, + "loss": 2.5716, + "step": 9310500 + }, + { + "epoch": 2.894459927226606, + "grad_norm": 8.884042739868164, + "learning_rate": 1.7590012128898953e-06, + "loss": 2.5339, + "step": 9311000 + }, + { + "epoch": 2.8946153595070934, + "grad_norm": 18.560678482055664, + "learning_rate": 1.756410674881781e-06, + "loss": 2.5646, + "step": 9311500 + }, + { + "epoch": 2.89477079178758, + "grad_norm": 8.443316459655762, + "learning_rate": 1.7538201368736662e-06, + "loss": 2.5925, + "step": 9312000 + }, + { + "epoch": 2.894926224068067, + "grad_norm": 8.89537239074707, + "learning_rate": 1.7512295988655517e-06, + "loss": 2.5466, + "step": 9312500 + }, + { + "epoch": 2.8950816563485535, + "grad_norm": 23.23969078063965, + "learning_rate": 1.748639060857437e-06, + "loss": 2.5221, + "step": 9313000 + }, + { + "epoch": 2.895237088629041, + "grad_norm": 9.232563018798828, + "learning_rate": 1.7460485228493226e-06, + "loss": 2.5539, + "step": 9313500 + }, + { + "epoch": 2.8953925209095273, + "grad_norm": 10.560479164123535, + "learning_rate": 1.7434579848412079e-06, + "loss": 2.5359, + "step": 9314000 + }, + { + "epoch": 2.8955479531900146, + "grad_norm": 13.420697212219238, + "learning_rate": 1.7408674468330933e-06, + "loss": 2.5373, + "step": 9314500 + }, + { + "epoch": 2.8957033854705014, + "grad_norm": 9.772665977478027, + "learning_rate": 1.7382769088249786e-06, + "loss": 2.5406, + "step": 9315000 + }, + { + "epoch": 2.8958588177509883, + "grad_norm": 7.896320343017578, + "learning_rate": 1.735686370816864e-06, + "loss": 2.581, + "step": 9315500 + }, + { + "epoch": 2.896014250031475, + "grad_norm": 12.98435115814209, + "learning_rate": 1.7330958328087497e-06, + "loss": 2.5422, + "step": 9316000 + }, + { + "epoch": 2.896169682311962, + "grad_norm": 9.72835636138916, + "learning_rate": 1.730505294800635e-06, + "loss": 2.5852, + "step": 9316500 + }, + { + "epoch": 2.896325114592449, + "grad_norm": 10.354592323303223, + "learning_rate": 1.7279147567925202e-06, + "loss": 2.5733, + "step": 9317000 + }, + { + "epoch": 2.8964805468729358, + "grad_norm": 9.28518295288086, + "learning_rate": 1.7253242187844057e-06, + "loss": 2.6008, + "step": 9317500 + }, + { + "epoch": 2.8966359791534226, + "grad_norm": 21.359203338623047, + "learning_rate": 1.7227336807762913e-06, + "loss": 2.5029, + "step": 9318000 + }, + { + "epoch": 2.8967914114339095, + "grad_norm": 20.933429718017578, + "learning_rate": 1.7201431427681766e-06, + "loss": 2.5836, + "step": 9318500 + }, + { + "epoch": 2.8969468437143964, + "grad_norm": 9.823260307312012, + "learning_rate": 1.7175526047600618e-06, + "loss": 2.5662, + "step": 9319000 + }, + { + "epoch": 2.8971022759948832, + "grad_norm": 9.345246315002441, + "learning_rate": 1.7149620667519473e-06, + "loss": 2.5351, + "step": 9319500 + }, + { + "epoch": 2.89725770827537, + "grad_norm": 9.137433052062988, + "learning_rate": 1.7123715287438325e-06, + "loss": 2.6035, + "step": 9320000 + }, + { + "epoch": 2.897413140555857, + "grad_norm": 22.417211532592773, + "learning_rate": 1.7097809907357182e-06, + "loss": 2.5251, + "step": 9320500 + }, + { + "epoch": 2.897568572836344, + "grad_norm": 11.53814697265625, + "learning_rate": 1.7071904527276034e-06, + "loss": 2.6015, + "step": 9321000 + }, + { + "epoch": 2.8977240051168307, + "grad_norm": 15.008426666259766, + "learning_rate": 1.704599914719489e-06, + "loss": 2.5425, + "step": 9321500 + }, + { + "epoch": 2.8978794373973176, + "grad_norm": 9.419525146484375, + "learning_rate": 1.7020093767113742e-06, + "loss": 2.6085, + "step": 9322000 + }, + { + "epoch": 2.8980348696778044, + "grad_norm": 17.100067138671875, + "learning_rate": 1.6994188387032598e-06, + "loss": 2.5646, + "step": 9322500 + }, + { + "epoch": 2.8981903019582913, + "grad_norm": 16.923297882080078, + "learning_rate": 1.696828300695145e-06, + "loss": 2.5466, + "step": 9323000 + }, + { + "epoch": 2.898345734238778, + "grad_norm": 22.952165603637695, + "learning_rate": 1.6942377626870305e-06, + "loss": 2.565, + "step": 9323500 + }, + { + "epoch": 2.898501166519265, + "grad_norm": 19.683712005615234, + "learning_rate": 1.6916472246789158e-06, + "loss": 2.5706, + "step": 9324000 + }, + { + "epoch": 2.898656598799752, + "grad_norm": 8.674336433410645, + "learning_rate": 1.689056686670801e-06, + "loss": 2.5475, + "step": 9324500 + }, + { + "epoch": 2.8988120310802388, + "grad_norm": 12.371286392211914, + "learning_rate": 1.6864661486626867e-06, + "loss": 2.5736, + "step": 9325000 + }, + { + "epoch": 2.8989674633607256, + "grad_norm": 8.971861839294434, + "learning_rate": 1.6838756106545722e-06, + "loss": 2.5227, + "step": 9325500 + }, + { + "epoch": 2.8991228956412125, + "grad_norm": 9.530074119567871, + "learning_rate": 1.6812850726464574e-06, + "loss": 2.5869, + "step": 9326000 + }, + { + "epoch": 2.8992783279216994, + "grad_norm": 11.036873817443848, + "learning_rate": 1.6786945346383427e-06, + "loss": 2.5598, + "step": 9326500 + }, + { + "epoch": 2.8994337602021862, + "grad_norm": 10.457618713378906, + "learning_rate": 1.6761039966302283e-06, + "loss": 2.5714, + "step": 9327000 + }, + { + "epoch": 2.899589192482673, + "grad_norm": 8.431673049926758, + "learning_rate": 1.6735134586221138e-06, + "loss": 2.6141, + "step": 9327500 + }, + { + "epoch": 2.89974462476316, + "grad_norm": 10.942034721374512, + "learning_rate": 1.670922920613999e-06, + "loss": 2.496, + "step": 9328000 + }, + { + "epoch": 2.899900057043647, + "grad_norm": 11.214948654174805, + "learning_rate": 1.6683323826058843e-06, + "loss": 2.5441, + "step": 9328500 + }, + { + "epoch": 2.9000554893241337, + "grad_norm": 15.569188117980957, + "learning_rate": 1.6657418445977697e-06, + "loss": 2.5565, + "step": 9329000 + }, + { + "epoch": 2.9002109216046206, + "grad_norm": 10.599376678466797, + "learning_rate": 1.6631513065896554e-06, + "loss": 2.5842, + "step": 9329500 + }, + { + "epoch": 2.9003663538851074, + "grad_norm": 10.528114318847656, + "learning_rate": 1.6605607685815407e-06, + "loss": 2.5927, + "step": 9330000 + }, + { + "epoch": 2.9005217861655943, + "grad_norm": 8.608526229858398, + "learning_rate": 1.657970230573426e-06, + "loss": 2.5588, + "step": 9330500 + }, + { + "epoch": 2.900677218446081, + "grad_norm": 9.618809700012207, + "learning_rate": 1.6553796925653114e-06, + "loss": 2.5612, + "step": 9331000 + }, + { + "epoch": 2.900832650726568, + "grad_norm": 11.06495189666748, + "learning_rate": 1.652789154557197e-06, + "loss": 2.5829, + "step": 9331500 + }, + { + "epoch": 2.900988083007055, + "grad_norm": 44.403011322021484, + "learning_rate": 1.6501986165490823e-06, + "loss": 2.5349, + "step": 9332000 + }, + { + "epoch": 2.9011435152875418, + "grad_norm": 9.73797607421875, + "learning_rate": 1.6476080785409675e-06, + "loss": 2.5905, + "step": 9332500 + }, + { + "epoch": 2.901298947568029, + "grad_norm": 9.281776428222656, + "learning_rate": 1.645017540532853e-06, + "loss": 2.5171, + "step": 9333000 + }, + { + "epoch": 2.9014543798485155, + "grad_norm": 10.36902141571045, + "learning_rate": 1.6424270025247382e-06, + "loss": 2.5779, + "step": 9333500 + }, + { + "epoch": 2.901609812129003, + "grad_norm": 10.731858253479004, + "learning_rate": 1.639836464516624e-06, + "loss": 2.5265, + "step": 9334000 + }, + { + "epoch": 2.9017652444094892, + "grad_norm": 8.803425788879395, + "learning_rate": 1.6372459265085092e-06, + "loss": 2.5544, + "step": 9334500 + }, + { + "epoch": 2.9019206766899766, + "grad_norm": 12.242776870727539, + "learning_rate": 1.6346553885003946e-06, + "loss": 2.5916, + "step": 9335000 + }, + { + "epoch": 2.902076108970463, + "grad_norm": 10.714262962341309, + "learning_rate": 1.6320648504922799e-06, + "loss": 2.5207, + "step": 9335500 + }, + { + "epoch": 2.9022315412509503, + "grad_norm": 10.375967979431152, + "learning_rate": 1.6294743124841655e-06, + "loss": 2.5518, + "step": 9336000 + }, + { + "epoch": 2.9023869735314367, + "grad_norm": 9.247485160827637, + "learning_rate": 1.6268837744760508e-06, + "loss": 2.542, + "step": 9336500 + }, + { + "epoch": 2.902542405811924, + "grad_norm": 9.952412605285645, + "learning_rate": 1.6242932364679362e-06, + "loss": 2.5452, + "step": 9337000 + }, + { + "epoch": 2.9026978380924104, + "grad_norm": 10.228850364685059, + "learning_rate": 1.6217026984598215e-06, + "loss": 2.5454, + "step": 9337500 + }, + { + "epoch": 2.9028532703728978, + "grad_norm": 12.389122009277344, + "learning_rate": 1.619112160451707e-06, + "loss": 2.5603, + "step": 9338000 + }, + { + "epoch": 2.9030087026533846, + "grad_norm": 9.839046478271484, + "learning_rate": 1.6165216224435924e-06, + "loss": 2.5649, + "step": 9338500 + }, + { + "epoch": 2.9031641349338715, + "grad_norm": 11.318297386169434, + "learning_rate": 1.6139310844354779e-06, + "loss": 2.5426, + "step": 9339000 + }, + { + "epoch": 2.9033195672143584, + "grad_norm": 10.475229263305664, + "learning_rate": 1.6113405464273631e-06, + "loss": 2.5685, + "step": 9339500 + }, + { + "epoch": 2.903474999494845, + "grad_norm": 10.01984691619873, + "learning_rate": 1.6087500084192486e-06, + "loss": 2.5439, + "step": 9340000 + }, + { + "epoch": 2.903630431775332, + "grad_norm": 11.846572875976562, + "learning_rate": 1.6061594704111342e-06, + "loss": 2.5844, + "step": 9340500 + }, + { + "epoch": 2.903785864055819, + "grad_norm": 26.08045196533203, + "learning_rate": 1.6035689324030195e-06, + "loss": 2.6031, + "step": 9341000 + }, + { + "epoch": 2.903941296336306, + "grad_norm": 10.375532150268555, + "learning_rate": 1.6009783943949047e-06, + "loss": 2.5779, + "step": 9341500 + }, + { + "epoch": 2.9040967286167927, + "grad_norm": 8.691535949707031, + "learning_rate": 1.5983878563867902e-06, + "loss": 2.5705, + "step": 9342000 + }, + { + "epoch": 2.9042521608972796, + "grad_norm": 9.555458068847656, + "learning_rate": 1.5957973183786754e-06, + "loss": 2.5248, + "step": 9342500 + }, + { + "epoch": 2.9044075931777664, + "grad_norm": 18.25539779663086, + "learning_rate": 1.5932067803705611e-06, + "loss": 2.5109, + "step": 9343000 + }, + { + "epoch": 2.9045630254582533, + "grad_norm": 10.278886795043945, + "learning_rate": 1.5906162423624464e-06, + "loss": 2.5395, + "step": 9343500 + }, + { + "epoch": 2.90471845773874, + "grad_norm": 11.77269172668457, + "learning_rate": 1.5880257043543318e-06, + "loss": 2.549, + "step": 9344000 + }, + { + "epoch": 2.904873890019227, + "grad_norm": 9.639198303222656, + "learning_rate": 1.585435166346217e-06, + "loss": 2.5469, + "step": 9344500 + }, + { + "epoch": 2.905029322299714, + "grad_norm": 11.523164749145508, + "learning_rate": 1.5828446283381027e-06, + "loss": 2.5241, + "step": 9345000 + }, + { + "epoch": 2.9051847545802008, + "grad_norm": 10.197734832763672, + "learning_rate": 1.580254090329988e-06, + "loss": 2.591, + "step": 9345500 + }, + { + "epoch": 2.9053401868606876, + "grad_norm": 10.565838813781738, + "learning_rate": 1.5776635523218734e-06, + "loss": 2.5596, + "step": 9346000 + }, + { + "epoch": 2.9054956191411745, + "grad_norm": 9.453608512878418, + "learning_rate": 1.5750730143137587e-06, + "loss": 2.5832, + "step": 9346500 + }, + { + "epoch": 2.9056510514216614, + "grad_norm": 8.81514835357666, + "learning_rate": 1.5724824763056444e-06, + "loss": 2.5886, + "step": 9347000 + }, + { + "epoch": 2.9058064837021482, + "grad_norm": 11.459412574768066, + "learning_rate": 1.5698919382975296e-06, + "loss": 2.569, + "step": 9347500 + }, + { + "epoch": 2.905961915982635, + "grad_norm": 10.276298522949219, + "learning_rate": 1.567301400289415e-06, + "loss": 2.5673, + "step": 9348000 + }, + { + "epoch": 2.906117348263122, + "grad_norm": 9.46594524383545, + "learning_rate": 1.5647108622813003e-06, + "loss": 2.5715, + "step": 9348500 + }, + { + "epoch": 2.906272780543609, + "grad_norm": 9.957636833190918, + "learning_rate": 1.5621203242731858e-06, + "loss": 2.5617, + "step": 9349000 + }, + { + "epoch": 2.9064282128240957, + "grad_norm": 40.886375427246094, + "learning_rate": 1.559529786265071e-06, + "loss": 2.5618, + "step": 9349500 + }, + { + "epoch": 2.9065836451045826, + "grad_norm": 17.444766998291016, + "learning_rate": 1.5569392482569567e-06, + "loss": 2.5235, + "step": 9350000 + }, + { + "epoch": 2.9067390773850694, + "grad_norm": 11.971524238586426, + "learning_rate": 1.554348710248842e-06, + "loss": 2.5658, + "step": 9350500 + }, + { + "epoch": 2.9068945096655563, + "grad_norm": 10.530224800109863, + "learning_rate": 1.5517581722407274e-06, + "loss": 2.5351, + "step": 9351000 + }, + { + "epoch": 2.907049941946043, + "grad_norm": 11.016140937805176, + "learning_rate": 1.5491676342326129e-06, + "loss": 2.5275, + "step": 9351500 + }, + { + "epoch": 2.90720537422653, + "grad_norm": 13.419705390930176, + "learning_rate": 1.5465770962244983e-06, + "loss": 2.514, + "step": 9352000 + }, + { + "epoch": 2.907360806507017, + "grad_norm": 15.029202461242676, + "learning_rate": 1.5439865582163836e-06, + "loss": 2.5394, + "step": 9352500 + }, + { + "epoch": 2.9075162387875038, + "grad_norm": 11.830757141113281, + "learning_rate": 1.5413960202082688e-06, + "loss": 2.5659, + "step": 9353000 + }, + { + "epoch": 2.9076716710679906, + "grad_norm": 11.093260765075684, + "learning_rate": 1.5388054822001545e-06, + "loss": 2.5571, + "step": 9353500 + }, + { + "epoch": 2.9078271033484775, + "grad_norm": 14.761646270751953, + "learning_rate": 1.5362149441920397e-06, + "loss": 2.5328, + "step": 9354000 + }, + { + "epoch": 2.9079825356289644, + "grad_norm": 26.916221618652344, + "learning_rate": 1.5336244061839252e-06, + "loss": 2.5559, + "step": 9354500 + }, + { + "epoch": 2.9081379679094512, + "grad_norm": 11.14825439453125, + "learning_rate": 1.5310338681758104e-06, + "loss": 2.5384, + "step": 9355000 + }, + { + "epoch": 2.908293400189938, + "grad_norm": 8.68564510345459, + "learning_rate": 1.5284433301676961e-06, + "loss": 2.5555, + "step": 9355500 + }, + { + "epoch": 2.908448832470425, + "grad_norm": 8.199522018432617, + "learning_rate": 1.5258527921595814e-06, + "loss": 2.5413, + "step": 9356000 + }, + { + "epoch": 2.9086042647509123, + "grad_norm": 10.353896141052246, + "learning_rate": 1.5232622541514668e-06, + "loss": 2.5628, + "step": 9356500 + }, + { + "epoch": 2.9087596970313987, + "grad_norm": 7.108763217926025, + "learning_rate": 1.520671716143352e-06, + "loss": 2.5924, + "step": 9357000 + }, + { + "epoch": 2.908915129311886, + "grad_norm": 9.584503173828125, + "learning_rate": 1.5180811781352375e-06, + "loss": 2.5637, + "step": 9357500 + }, + { + "epoch": 2.9090705615923724, + "grad_norm": 18.018264770507812, + "learning_rate": 1.515490640127123e-06, + "loss": 2.5499, + "step": 9358000 + }, + { + "epoch": 2.9092259938728597, + "grad_norm": 7.694958209991455, + "learning_rate": 1.5129001021190082e-06, + "loss": 2.5559, + "step": 9358500 + }, + { + "epoch": 2.909381426153346, + "grad_norm": 10.425806045532227, + "learning_rate": 1.5103095641108937e-06, + "loss": 2.5384, + "step": 9359000 + }, + { + "epoch": 2.9095368584338335, + "grad_norm": 9.812454223632812, + "learning_rate": 1.5077190261027792e-06, + "loss": 2.5781, + "step": 9359500 + }, + { + "epoch": 2.90969229071432, + "grad_norm": 11.078044891357422, + "learning_rate": 1.5051284880946646e-06, + "loss": 2.5587, + "step": 9360000 + }, + { + "epoch": 2.909847722994807, + "grad_norm": 11.284839630126953, + "learning_rate": 1.5025379500865499e-06, + "loss": 2.5549, + "step": 9360500 + }, + { + "epoch": 2.9100031552752936, + "grad_norm": 10.2720308303833, + "learning_rate": 1.4999474120784353e-06, + "loss": 2.558, + "step": 9361000 + }, + { + "epoch": 2.910158587555781, + "grad_norm": 11.08910846710205, + "learning_rate": 1.4973568740703208e-06, + "loss": 2.5386, + "step": 9361500 + }, + { + "epoch": 2.9103140198362674, + "grad_norm": 60.537113189697266, + "learning_rate": 1.494766336062206e-06, + "loss": 2.5674, + "step": 9362000 + }, + { + "epoch": 2.9104694521167547, + "grad_norm": 11.686759948730469, + "learning_rate": 1.4921757980540915e-06, + "loss": 2.5782, + "step": 9362500 + }, + { + "epoch": 2.9106248843972415, + "grad_norm": 10.060784339904785, + "learning_rate": 1.489585260045977e-06, + "loss": 2.5628, + "step": 9363000 + }, + { + "epoch": 2.9107803166777284, + "grad_norm": 10.198882102966309, + "learning_rate": 1.4869947220378624e-06, + "loss": 2.5576, + "step": 9363500 + }, + { + "epoch": 2.9109357489582153, + "grad_norm": 9.506739616394043, + "learning_rate": 1.4844041840297477e-06, + "loss": 2.5584, + "step": 9364000 + }, + { + "epoch": 2.911091181238702, + "grad_norm": 10.046979904174805, + "learning_rate": 1.4818136460216331e-06, + "loss": 2.5248, + "step": 9364500 + }, + { + "epoch": 2.911246613519189, + "grad_norm": 15.763891220092773, + "learning_rate": 1.4792231080135186e-06, + "loss": 2.5325, + "step": 9365000 + }, + { + "epoch": 2.911402045799676, + "grad_norm": 10.10346794128418, + "learning_rate": 1.476632570005404e-06, + "loss": 2.5409, + "step": 9365500 + }, + { + "epoch": 2.9115574780801627, + "grad_norm": 11.466666221618652, + "learning_rate": 1.4740420319972893e-06, + "loss": 2.568, + "step": 9366000 + }, + { + "epoch": 2.9117129103606496, + "grad_norm": 9.274223327636719, + "learning_rate": 1.4714514939891747e-06, + "loss": 2.5862, + "step": 9366500 + }, + { + "epoch": 2.9118683426411365, + "grad_norm": 7.858643054962158, + "learning_rate": 1.4688609559810602e-06, + "loss": 2.5491, + "step": 9367000 + }, + { + "epoch": 2.9120237749216233, + "grad_norm": 13.005814552307129, + "learning_rate": 1.4662704179729454e-06, + "loss": 2.5965, + "step": 9367500 + }, + { + "epoch": 2.91217920720211, + "grad_norm": 10.399495124816895, + "learning_rate": 1.463679879964831e-06, + "loss": 2.5219, + "step": 9368000 + }, + { + "epoch": 2.912334639482597, + "grad_norm": 10.581647872924805, + "learning_rate": 1.4610893419567164e-06, + "loss": 2.5381, + "step": 9368500 + }, + { + "epoch": 2.912490071763084, + "grad_norm": 8.709013938903809, + "learning_rate": 1.4584988039486018e-06, + "loss": 2.5111, + "step": 9369000 + }, + { + "epoch": 2.912645504043571, + "grad_norm": 27.252002716064453, + "learning_rate": 1.455908265940487e-06, + "loss": 2.5075, + "step": 9369500 + }, + { + "epoch": 2.9128009363240577, + "grad_norm": 8.505922317504883, + "learning_rate": 1.4533177279323725e-06, + "loss": 2.5501, + "step": 9370000 + }, + { + "epoch": 2.9129563686045445, + "grad_norm": 25.861061096191406, + "learning_rate": 1.450727189924258e-06, + "loss": 2.5926, + "step": 9370500 + }, + { + "epoch": 2.9131118008850314, + "grad_norm": 10.436534881591797, + "learning_rate": 1.4481366519161434e-06, + "loss": 2.5452, + "step": 9371000 + }, + { + "epoch": 2.9132672331655183, + "grad_norm": 11.91588020324707, + "learning_rate": 1.4455461139080287e-06, + "loss": 2.5717, + "step": 9371500 + }, + { + "epoch": 2.913422665446005, + "grad_norm": 9.644073486328125, + "learning_rate": 1.442955575899914e-06, + "loss": 2.5709, + "step": 9372000 + }, + { + "epoch": 2.913578097726492, + "grad_norm": 10.423542022705078, + "learning_rate": 1.4403650378917996e-06, + "loss": 2.5543, + "step": 9372500 + }, + { + "epoch": 2.913733530006979, + "grad_norm": 8.563035011291504, + "learning_rate": 1.4377744998836849e-06, + "loss": 2.5573, + "step": 9373000 + }, + { + "epoch": 2.9138889622874657, + "grad_norm": 9.41515827178955, + "learning_rate": 1.4351839618755703e-06, + "loss": 2.59, + "step": 9373500 + }, + { + "epoch": 2.9140443945679526, + "grad_norm": 10.264147758483887, + "learning_rate": 1.4325934238674556e-06, + "loss": 2.532, + "step": 9374000 + }, + { + "epoch": 2.9141998268484395, + "grad_norm": 12.431987762451172, + "learning_rate": 1.4300028858593412e-06, + "loss": 2.5439, + "step": 9374500 + }, + { + "epoch": 2.9143552591289263, + "grad_norm": 14.08554458618164, + "learning_rate": 1.4274123478512265e-06, + "loss": 2.4967, + "step": 9375000 + }, + { + "epoch": 2.914510691409413, + "grad_norm": 12.197053909301758, + "learning_rate": 1.424821809843112e-06, + "loss": 2.5582, + "step": 9375500 + }, + { + "epoch": 2.9146661236899, + "grad_norm": 10.316427230834961, + "learning_rate": 1.4222312718349974e-06, + "loss": 2.6007, + "step": 9376000 + }, + { + "epoch": 2.914821555970387, + "grad_norm": 10.806053161621094, + "learning_rate": 1.4196407338268827e-06, + "loss": 2.5574, + "step": 9376500 + }, + { + "epoch": 2.914976988250874, + "grad_norm": 17.310293197631836, + "learning_rate": 1.4170501958187681e-06, + "loss": 2.5571, + "step": 9377000 + }, + { + "epoch": 2.9151324205313607, + "grad_norm": 8.934252738952637, + "learning_rate": 1.4144596578106534e-06, + "loss": 2.5562, + "step": 9377500 + }, + { + "epoch": 2.9152878528118475, + "grad_norm": 10.803428649902344, + "learning_rate": 1.411869119802539e-06, + "loss": 2.5191, + "step": 9378000 + }, + { + "epoch": 2.9154432850923344, + "grad_norm": 16.18064308166504, + "learning_rate": 1.4092785817944243e-06, + "loss": 2.5291, + "step": 9378500 + }, + { + "epoch": 2.9155987173728213, + "grad_norm": 29.7318115234375, + "learning_rate": 1.4066880437863097e-06, + "loss": 2.5684, + "step": 9379000 + }, + { + "epoch": 2.915754149653308, + "grad_norm": 20.27215576171875, + "learning_rate": 1.404097505778195e-06, + "loss": 2.5597, + "step": 9379500 + }, + { + "epoch": 2.915909581933795, + "grad_norm": 9.885123252868652, + "learning_rate": 1.4015069677700807e-06, + "loss": 2.526, + "step": 9380000 + }, + { + "epoch": 2.916065014214282, + "grad_norm": 10.711761474609375, + "learning_rate": 1.398916429761966e-06, + "loss": 2.5771, + "step": 9380500 + }, + { + "epoch": 2.916220446494769, + "grad_norm": 9.89456558227539, + "learning_rate": 1.3963258917538512e-06, + "loss": 2.5277, + "step": 9381000 + }, + { + "epoch": 2.9163758787752556, + "grad_norm": 19.211057662963867, + "learning_rate": 1.3937353537457366e-06, + "loss": 2.6222, + "step": 9381500 + }, + { + "epoch": 2.916531311055743, + "grad_norm": 9.639769554138184, + "learning_rate": 1.391144815737622e-06, + "loss": 2.5631, + "step": 9382000 + }, + { + "epoch": 2.9166867433362293, + "grad_norm": 10.159972190856934, + "learning_rate": 1.3885542777295075e-06, + "loss": 2.5228, + "step": 9382500 + }, + { + "epoch": 2.9168421756167167, + "grad_norm": 7.706073760986328, + "learning_rate": 1.3859637397213928e-06, + "loss": 2.5191, + "step": 9383000 + }, + { + "epoch": 2.916997607897203, + "grad_norm": 12.534255981445312, + "learning_rate": 1.3833732017132782e-06, + "loss": 2.5338, + "step": 9383500 + }, + { + "epoch": 2.9171530401776904, + "grad_norm": 32.82538986206055, + "learning_rate": 1.3807826637051637e-06, + "loss": 2.5575, + "step": 9384000 + }, + { + "epoch": 2.917308472458177, + "grad_norm": 10.373442649841309, + "learning_rate": 1.3781921256970492e-06, + "loss": 2.5492, + "step": 9384500 + }, + { + "epoch": 2.917463904738664, + "grad_norm": 9.158356666564941, + "learning_rate": 1.3756015876889344e-06, + "loss": 2.5425, + "step": 9385000 + }, + { + "epoch": 2.9176193370191505, + "grad_norm": 14.653316497802734, + "learning_rate": 1.3730110496808199e-06, + "loss": 2.5557, + "step": 9385500 + }, + { + "epoch": 2.917774769299638, + "grad_norm": 8.18665885925293, + "learning_rate": 1.3704205116727053e-06, + "loss": 2.5363, + "step": 9386000 + }, + { + "epoch": 2.9179302015801247, + "grad_norm": 8.66610050201416, + "learning_rate": 1.3678299736645906e-06, + "loss": 2.5111, + "step": 9386500 + }, + { + "epoch": 2.9180856338606116, + "grad_norm": 12.602442741394043, + "learning_rate": 1.365239435656476e-06, + "loss": 2.5495, + "step": 9387000 + }, + { + "epoch": 2.9182410661410985, + "grad_norm": 10.041719436645508, + "learning_rate": 1.3626488976483615e-06, + "loss": 2.5164, + "step": 9387500 + }, + { + "epoch": 2.9183964984215853, + "grad_norm": 10.743947982788086, + "learning_rate": 1.360058359640247e-06, + "loss": 2.58, + "step": 9388000 + }, + { + "epoch": 2.918551930702072, + "grad_norm": 7.923871040344238, + "learning_rate": 1.3574678216321322e-06, + "loss": 2.5224, + "step": 9388500 + }, + { + "epoch": 2.918707362982559, + "grad_norm": 10.971399307250977, + "learning_rate": 1.3548772836240177e-06, + "loss": 2.6034, + "step": 9389000 + }, + { + "epoch": 2.918862795263046, + "grad_norm": 6.895266532897949, + "learning_rate": 1.3522867456159031e-06, + "loss": 2.5859, + "step": 9389500 + }, + { + "epoch": 2.919018227543533, + "grad_norm": 7.746025562286377, + "learning_rate": 1.3496962076077886e-06, + "loss": 2.5454, + "step": 9390000 + }, + { + "epoch": 2.9191736598240197, + "grad_norm": 10.982757568359375, + "learning_rate": 1.3471056695996738e-06, + "loss": 2.5518, + "step": 9390500 + }, + { + "epoch": 2.9193290921045065, + "grad_norm": 12.811943054199219, + "learning_rate": 1.3445151315915593e-06, + "loss": 2.5752, + "step": 9391000 + }, + { + "epoch": 2.9194845243849934, + "grad_norm": 10.420856475830078, + "learning_rate": 1.3419245935834447e-06, + "loss": 2.5488, + "step": 9391500 + }, + { + "epoch": 2.9196399566654803, + "grad_norm": 12.168075561523438, + "learning_rate": 1.33933405557533e-06, + "loss": 2.5801, + "step": 9392000 + }, + { + "epoch": 2.919795388945967, + "grad_norm": 11.097491264343262, + "learning_rate": 1.3367435175672154e-06, + "loss": 2.5543, + "step": 9392500 + }, + { + "epoch": 2.919950821226454, + "grad_norm": 15.400562286376953, + "learning_rate": 1.334152979559101e-06, + "loss": 2.5161, + "step": 9393000 + }, + { + "epoch": 2.920106253506941, + "grad_norm": 9.18416690826416, + "learning_rate": 1.3315624415509864e-06, + "loss": 2.5132, + "step": 9393500 + }, + { + "epoch": 2.9202616857874277, + "grad_norm": 11.55815601348877, + "learning_rate": 1.3289719035428716e-06, + "loss": 2.6112, + "step": 9394000 + }, + { + "epoch": 2.9204171180679146, + "grad_norm": 9.681013107299805, + "learning_rate": 1.326381365534757e-06, + "loss": 2.5444, + "step": 9394500 + }, + { + "epoch": 2.9205725503484015, + "grad_norm": 9.69070053100586, + "learning_rate": 1.3237908275266425e-06, + "loss": 2.553, + "step": 9395000 + }, + { + "epoch": 2.9207279826288883, + "grad_norm": 10.641267776489258, + "learning_rate": 1.3212002895185278e-06, + "loss": 2.5834, + "step": 9395500 + }, + { + "epoch": 2.920883414909375, + "grad_norm": 13.877685546875, + "learning_rate": 1.3186097515104132e-06, + "loss": 2.5935, + "step": 9396000 + }, + { + "epoch": 2.921038847189862, + "grad_norm": 13.624731063842773, + "learning_rate": 1.3160192135022985e-06, + "loss": 2.5368, + "step": 9396500 + }, + { + "epoch": 2.921194279470349, + "grad_norm": 26.1025390625, + "learning_rate": 1.3134286754941842e-06, + "loss": 2.5746, + "step": 9397000 + }, + { + "epoch": 2.921349711750836, + "grad_norm": 9.464317321777344, + "learning_rate": 1.3108381374860694e-06, + "loss": 2.5753, + "step": 9397500 + }, + { + "epoch": 2.9215051440313227, + "grad_norm": 7.850105285644531, + "learning_rate": 1.3082475994779549e-06, + "loss": 2.6075, + "step": 9398000 + }, + { + "epoch": 2.9216605763118095, + "grad_norm": 10.94693660736084, + "learning_rate": 1.3056570614698403e-06, + "loss": 2.5665, + "step": 9398500 + }, + { + "epoch": 2.9218160085922964, + "grad_norm": 13.635025978088379, + "learning_rate": 1.3030665234617258e-06, + "loss": 2.5269, + "step": 9399000 + }, + { + "epoch": 2.9219714408727833, + "grad_norm": 8.545040130615234, + "learning_rate": 1.300475985453611e-06, + "loss": 2.4977, + "step": 9399500 + }, + { + "epoch": 2.92212687315327, + "grad_norm": 10.503376007080078, + "learning_rate": 1.2978854474454963e-06, + "loss": 2.5602, + "step": 9400000 + }, + { + "epoch": 2.922282305433757, + "grad_norm": 11.209722518920898, + "learning_rate": 1.295294909437382e-06, + "loss": 2.5381, + "step": 9400500 + }, + { + "epoch": 2.922437737714244, + "grad_norm": 13.451593399047852, + "learning_rate": 1.2927043714292672e-06, + "loss": 2.5435, + "step": 9401000 + }, + { + "epoch": 2.9225931699947307, + "grad_norm": 11.832096099853516, + "learning_rate": 1.2901138334211527e-06, + "loss": 2.5988, + "step": 9401500 + }, + { + "epoch": 2.9227486022752176, + "grad_norm": 13.26115608215332, + "learning_rate": 1.287523295413038e-06, + "loss": 2.5842, + "step": 9402000 + }, + { + "epoch": 2.9229040345557045, + "grad_norm": 9.702138900756836, + "learning_rate": 1.2849327574049236e-06, + "loss": 2.5666, + "step": 9402500 + }, + { + "epoch": 2.9230594668361913, + "grad_norm": 16.700212478637695, + "learning_rate": 1.2823422193968088e-06, + "loss": 2.5406, + "step": 9403000 + }, + { + "epoch": 2.923214899116678, + "grad_norm": 9.662495613098145, + "learning_rate": 1.2797516813886943e-06, + "loss": 2.5418, + "step": 9403500 + }, + { + "epoch": 2.923370331397165, + "grad_norm": 7.498327255249023, + "learning_rate": 1.2771611433805795e-06, + "loss": 2.5325, + "step": 9404000 + }, + { + "epoch": 2.9235257636776524, + "grad_norm": 10.085265159606934, + "learning_rate": 1.2745706053724652e-06, + "loss": 2.5719, + "step": 9404500 + }, + { + "epoch": 2.923681195958139, + "grad_norm": 10.09762191772461, + "learning_rate": 1.2719800673643504e-06, + "loss": 2.567, + "step": 9405000 + }, + { + "epoch": 2.923836628238626, + "grad_norm": 12.914572715759277, + "learning_rate": 1.2693895293562357e-06, + "loss": 2.5378, + "step": 9405500 + }, + { + "epoch": 2.9239920605191125, + "grad_norm": 10.888591766357422, + "learning_rate": 1.2667989913481212e-06, + "loss": 2.5294, + "step": 9406000 + }, + { + "epoch": 2.9241474927996, + "grad_norm": 9.535743713378906, + "learning_rate": 1.2642084533400066e-06, + "loss": 2.5662, + "step": 9406500 + }, + { + "epoch": 2.9243029250800863, + "grad_norm": 12.011346817016602, + "learning_rate": 1.261617915331892e-06, + "loss": 2.5538, + "step": 9407000 + }, + { + "epoch": 2.9244583573605736, + "grad_norm": 55.002349853515625, + "learning_rate": 1.2590273773237773e-06, + "loss": 2.5245, + "step": 9407500 + }, + { + "epoch": 2.92461378964106, + "grad_norm": 51.01461410522461, + "learning_rate": 1.2564368393156628e-06, + "loss": 2.5786, + "step": 9408000 + }, + { + "epoch": 2.9247692219215473, + "grad_norm": 9.367362022399902, + "learning_rate": 1.2538463013075482e-06, + "loss": 2.5165, + "step": 9408500 + }, + { + "epoch": 2.9249246542020337, + "grad_norm": 9.879948616027832, + "learning_rate": 1.2512557632994337e-06, + "loss": 2.527, + "step": 9409000 + }, + { + "epoch": 2.925080086482521, + "grad_norm": 8.670969009399414, + "learning_rate": 1.248665225291319e-06, + "loss": 2.5453, + "step": 9409500 + }, + { + "epoch": 2.9252355187630075, + "grad_norm": 10.208674430847168, + "learning_rate": 1.2460746872832044e-06, + "loss": 2.5521, + "step": 9410000 + }, + { + "epoch": 2.925390951043495, + "grad_norm": 8.18177604675293, + "learning_rate": 1.2434841492750899e-06, + "loss": 2.5212, + "step": 9410500 + }, + { + "epoch": 2.9255463833239816, + "grad_norm": 8.864314079284668, + "learning_rate": 1.2408936112669751e-06, + "loss": 2.5418, + "step": 9411000 + }, + { + "epoch": 2.9257018156044685, + "grad_norm": 10.6257963180542, + "learning_rate": 1.2383030732588606e-06, + "loss": 2.5191, + "step": 9411500 + }, + { + "epoch": 2.9258572478849554, + "grad_norm": 10.479496955871582, + "learning_rate": 1.235712535250746e-06, + "loss": 2.5787, + "step": 9412000 + }, + { + "epoch": 2.9260126801654422, + "grad_norm": 18.737071990966797, + "learning_rate": 1.2331219972426315e-06, + "loss": 2.5737, + "step": 9412500 + }, + { + "epoch": 2.926168112445929, + "grad_norm": 9.517501831054688, + "learning_rate": 1.2305314592345167e-06, + "loss": 2.5524, + "step": 9413000 + }, + { + "epoch": 2.926323544726416, + "grad_norm": 8.597837448120117, + "learning_rate": 1.2279409212264022e-06, + "loss": 2.5713, + "step": 9413500 + }, + { + "epoch": 2.926478977006903, + "grad_norm": 9.081242561340332, + "learning_rate": 1.2253503832182877e-06, + "loss": 2.4968, + "step": 9414000 + }, + { + "epoch": 2.9266344092873897, + "grad_norm": 23.266443252563477, + "learning_rate": 1.222759845210173e-06, + "loss": 2.5731, + "step": 9414500 + }, + { + "epoch": 2.9267898415678766, + "grad_norm": 11.383261680603027, + "learning_rate": 1.2201693072020584e-06, + "loss": 2.5102, + "step": 9415000 + }, + { + "epoch": 2.9269452738483634, + "grad_norm": 15.2766752243042, + "learning_rate": 1.2175787691939438e-06, + "loss": 2.5524, + "step": 9415500 + }, + { + "epoch": 2.9271007061288503, + "grad_norm": 8.933450698852539, + "learning_rate": 1.2149882311858293e-06, + "loss": 2.5244, + "step": 9416000 + }, + { + "epoch": 2.927256138409337, + "grad_norm": 10.113451957702637, + "learning_rate": 1.2123976931777145e-06, + "loss": 2.591, + "step": 9416500 + }, + { + "epoch": 2.927411570689824, + "grad_norm": 16.050153732299805, + "learning_rate": 1.2098071551696e-06, + "loss": 2.5501, + "step": 9417000 + }, + { + "epoch": 2.927567002970311, + "grad_norm": 8.96247673034668, + "learning_rate": 1.2072166171614854e-06, + "loss": 2.5054, + "step": 9417500 + }, + { + "epoch": 2.927722435250798, + "grad_norm": 10.409266471862793, + "learning_rate": 1.204626079153371e-06, + "loss": 2.5657, + "step": 9418000 + }, + { + "epoch": 2.9278778675312847, + "grad_norm": 9.934226989746094, + "learning_rate": 1.2020355411452562e-06, + "loss": 2.5436, + "step": 9418500 + }, + { + "epoch": 2.9280332998117715, + "grad_norm": 9.145532608032227, + "learning_rate": 1.1994450031371416e-06, + "loss": 2.5998, + "step": 9419000 + }, + { + "epoch": 2.9281887320922584, + "grad_norm": 10.955909729003906, + "learning_rate": 1.196854465129027e-06, + "loss": 2.5638, + "step": 9419500 + }, + { + "epoch": 2.9283441643727453, + "grad_norm": 26.963672637939453, + "learning_rate": 1.1942639271209123e-06, + "loss": 2.5035, + "step": 9420000 + }, + { + "epoch": 2.928499596653232, + "grad_norm": 17.977542877197266, + "learning_rate": 1.1916733891127978e-06, + "loss": 2.5298, + "step": 9420500 + }, + { + "epoch": 2.928655028933719, + "grad_norm": 15.292874336242676, + "learning_rate": 1.189082851104683e-06, + "loss": 2.5027, + "step": 9421000 + }, + { + "epoch": 2.928810461214206, + "grad_norm": 10.42463207244873, + "learning_rate": 1.1864923130965687e-06, + "loss": 2.5555, + "step": 9421500 + }, + { + "epoch": 2.9289658934946927, + "grad_norm": 13.56664752960205, + "learning_rate": 1.183901775088454e-06, + "loss": 2.5696, + "step": 9422000 + }, + { + "epoch": 2.9291213257751796, + "grad_norm": 9.366950988769531, + "learning_rate": 1.1813112370803394e-06, + "loss": 2.5605, + "step": 9422500 + }, + { + "epoch": 2.9292767580556665, + "grad_norm": 9.712236404418945, + "learning_rate": 1.1787206990722249e-06, + "loss": 2.4902, + "step": 9423000 + }, + { + "epoch": 2.9294321903361533, + "grad_norm": 27.553466796875, + "learning_rate": 1.1761301610641103e-06, + "loss": 2.5875, + "step": 9423500 + }, + { + "epoch": 2.92958762261664, + "grad_norm": 9.644946098327637, + "learning_rate": 1.1735396230559956e-06, + "loss": 2.5444, + "step": 9424000 + }, + { + "epoch": 2.929743054897127, + "grad_norm": 6.995643138885498, + "learning_rate": 1.1709490850478808e-06, + "loss": 2.5881, + "step": 9424500 + }, + { + "epoch": 2.929898487177614, + "grad_norm": 11.17707347869873, + "learning_rate": 1.1683585470397665e-06, + "loss": 2.5276, + "step": 9425000 + }, + { + "epoch": 2.930053919458101, + "grad_norm": 9.960949897766113, + "learning_rate": 1.1657680090316517e-06, + "loss": 2.5687, + "step": 9425500 + }, + { + "epoch": 2.9302093517385877, + "grad_norm": 10.848793029785156, + "learning_rate": 1.1631774710235372e-06, + "loss": 2.5369, + "step": 9426000 + }, + { + "epoch": 2.9303647840190745, + "grad_norm": 44.24026107788086, + "learning_rate": 1.1605869330154224e-06, + "loss": 2.5878, + "step": 9426500 + }, + { + "epoch": 2.9305202162995614, + "grad_norm": 9.258546829223633, + "learning_rate": 1.1579963950073081e-06, + "loss": 2.5453, + "step": 9427000 + }, + { + "epoch": 2.9306756485800483, + "grad_norm": 9.004166603088379, + "learning_rate": 1.1554058569991934e-06, + "loss": 2.5532, + "step": 9427500 + }, + { + "epoch": 2.930831080860535, + "grad_norm": 10.269296646118164, + "learning_rate": 1.1528153189910788e-06, + "loss": 2.5199, + "step": 9428000 + }, + { + "epoch": 2.930986513141022, + "grad_norm": 9.08797550201416, + "learning_rate": 1.150224780982964e-06, + "loss": 2.5878, + "step": 9428500 + }, + { + "epoch": 2.9311419454215093, + "grad_norm": 22.311983108520508, + "learning_rate": 1.1476342429748495e-06, + "loss": 2.5488, + "step": 9429000 + }, + { + "epoch": 2.9312973777019957, + "grad_norm": 10.635948181152344, + "learning_rate": 1.145043704966735e-06, + "loss": 2.5799, + "step": 9429500 + }, + { + "epoch": 2.931452809982483, + "grad_norm": 8.638033866882324, + "learning_rate": 1.1424531669586202e-06, + "loss": 2.5339, + "step": 9430000 + }, + { + "epoch": 2.9316082422629695, + "grad_norm": 10.532668113708496, + "learning_rate": 1.1398626289505057e-06, + "loss": 2.5226, + "step": 9430500 + }, + { + "epoch": 2.9317636745434568, + "grad_norm": 11.113556861877441, + "learning_rate": 1.1372720909423912e-06, + "loss": 2.5127, + "step": 9431000 + }, + { + "epoch": 2.931919106823943, + "grad_norm": 10.11831283569336, + "learning_rate": 1.1346815529342766e-06, + "loss": 2.5457, + "step": 9431500 + }, + { + "epoch": 2.9320745391044305, + "grad_norm": 9.6809720993042, + "learning_rate": 1.1320910149261619e-06, + "loss": 2.5916, + "step": 9432000 + }, + { + "epoch": 2.932229971384917, + "grad_norm": 12.910767555236816, + "learning_rate": 1.1295004769180473e-06, + "loss": 2.5616, + "step": 9432500 + }, + { + "epoch": 2.9323854036654042, + "grad_norm": 10.323822975158691, + "learning_rate": 1.1269099389099328e-06, + "loss": 2.5558, + "step": 9433000 + }, + { + "epoch": 2.9325408359458907, + "grad_norm": 10.450052261352539, + "learning_rate": 1.124319400901818e-06, + "loss": 2.518, + "step": 9433500 + }, + { + "epoch": 2.932696268226378, + "grad_norm": 8.999446868896484, + "learning_rate": 1.1217288628937035e-06, + "loss": 2.5302, + "step": 9434000 + }, + { + "epoch": 2.9328517005068644, + "grad_norm": 11.000748634338379, + "learning_rate": 1.119138324885589e-06, + "loss": 2.5446, + "step": 9434500 + }, + { + "epoch": 2.9330071327873517, + "grad_norm": 11.744410514831543, + "learning_rate": 1.1165477868774744e-06, + "loss": 2.519, + "step": 9435000 + }, + { + "epoch": 2.9331625650678386, + "grad_norm": 10.219304084777832, + "learning_rate": 1.1139572488693597e-06, + "loss": 2.5468, + "step": 9435500 + }, + { + "epoch": 2.9333179973483254, + "grad_norm": 12.345999717712402, + "learning_rate": 1.1113667108612451e-06, + "loss": 2.5336, + "step": 9436000 + }, + { + "epoch": 2.9334734296288123, + "grad_norm": 9.922389030456543, + "learning_rate": 1.1087761728531306e-06, + "loss": 2.5186, + "step": 9436500 + }, + { + "epoch": 2.933628861909299, + "grad_norm": 11.41718864440918, + "learning_rate": 1.106185634845016e-06, + "loss": 2.5754, + "step": 9437000 + }, + { + "epoch": 2.933784294189786, + "grad_norm": 10.01362133026123, + "learning_rate": 1.1035950968369013e-06, + "loss": 2.5604, + "step": 9437500 + }, + { + "epoch": 2.933939726470273, + "grad_norm": 7.66773796081543, + "learning_rate": 1.1010045588287867e-06, + "loss": 2.5306, + "step": 9438000 + }, + { + "epoch": 2.9340951587507598, + "grad_norm": 9.039938926696777, + "learning_rate": 1.0984140208206722e-06, + "loss": 2.54, + "step": 9438500 + }, + { + "epoch": 2.9342505910312466, + "grad_norm": 13.069718360900879, + "learning_rate": 1.0958234828125574e-06, + "loss": 2.5145, + "step": 9439000 + }, + { + "epoch": 2.9344060233117335, + "grad_norm": 10.315326690673828, + "learning_rate": 1.093232944804443e-06, + "loss": 2.5359, + "step": 9439500 + }, + { + "epoch": 2.9345614555922204, + "grad_norm": 12.348262786865234, + "learning_rate": 1.0906424067963284e-06, + "loss": 2.5082, + "step": 9440000 + }, + { + "epoch": 2.9347168878727072, + "grad_norm": 8.945372581481934, + "learning_rate": 1.0880518687882138e-06, + "loss": 2.5683, + "step": 9440500 + }, + { + "epoch": 2.934872320153194, + "grad_norm": 9.931254386901855, + "learning_rate": 1.085461330780099e-06, + "loss": 2.5194, + "step": 9441000 + }, + { + "epoch": 2.935027752433681, + "grad_norm": 8.225577354431152, + "learning_rate": 1.0828707927719845e-06, + "loss": 2.5675, + "step": 9441500 + }, + { + "epoch": 2.935183184714168, + "grad_norm": 8.163716316223145, + "learning_rate": 1.08028025476387e-06, + "loss": 2.5473, + "step": 9442000 + }, + { + "epoch": 2.9353386169946547, + "grad_norm": 9.18761920928955, + "learning_rate": 1.0776897167557554e-06, + "loss": 2.5364, + "step": 9442500 + }, + { + "epoch": 2.9354940492751416, + "grad_norm": 8.267258644104004, + "learning_rate": 1.0750991787476407e-06, + "loss": 2.5403, + "step": 9443000 + }, + { + "epoch": 2.9356494815556284, + "grad_norm": 9.145506858825684, + "learning_rate": 1.072508640739526e-06, + "loss": 2.5294, + "step": 9443500 + }, + { + "epoch": 2.9358049138361153, + "grad_norm": 8.969868659973145, + "learning_rate": 1.0699181027314116e-06, + "loss": 2.5951, + "step": 9444000 + }, + { + "epoch": 2.935960346116602, + "grad_norm": 8.96842098236084, + "learning_rate": 1.0673275647232969e-06, + "loss": 2.5316, + "step": 9444500 + }, + { + "epoch": 2.936115778397089, + "grad_norm": 9.223522186279297, + "learning_rate": 1.0647370267151823e-06, + "loss": 2.5209, + "step": 9445000 + }, + { + "epoch": 2.936271210677576, + "grad_norm": 15.361015319824219, + "learning_rate": 1.0621464887070676e-06, + "loss": 2.5451, + "step": 9445500 + }, + { + "epoch": 2.9364266429580628, + "grad_norm": 10.822629928588867, + "learning_rate": 1.0595559506989532e-06, + "loss": 2.518, + "step": 9446000 + }, + { + "epoch": 2.9365820752385496, + "grad_norm": 10.599461555480957, + "learning_rate": 1.0569654126908385e-06, + "loss": 2.5302, + "step": 9446500 + }, + { + "epoch": 2.9367375075190365, + "grad_norm": 21.351850509643555, + "learning_rate": 1.054374874682724e-06, + "loss": 2.5489, + "step": 9447000 + }, + { + "epoch": 2.9368929397995234, + "grad_norm": 9.228177070617676, + "learning_rate": 1.0517843366746094e-06, + "loss": 2.5107, + "step": 9447500 + }, + { + "epoch": 2.9370483720800102, + "grad_norm": 16.218246459960938, + "learning_rate": 1.0491937986664947e-06, + "loss": 2.4958, + "step": 9448000 + }, + { + "epoch": 2.937203804360497, + "grad_norm": 11.452936172485352, + "learning_rate": 1.0466032606583801e-06, + "loss": 2.5414, + "step": 9448500 + }, + { + "epoch": 2.937359236640984, + "grad_norm": 9.074593544006348, + "learning_rate": 1.0440127226502654e-06, + "loss": 2.5687, + "step": 9449000 + }, + { + "epoch": 2.937514668921471, + "grad_norm": 8.058931350708008, + "learning_rate": 1.041422184642151e-06, + "loss": 2.5882, + "step": 9449500 + }, + { + "epoch": 2.9376701012019577, + "grad_norm": 19.75777244567871, + "learning_rate": 1.0388316466340363e-06, + "loss": 2.5139, + "step": 9450000 + }, + { + "epoch": 2.9378255334824446, + "grad_norm": 10.58510971069336, + "learning_rate": 1.0362411086259217e-06, + "loss": 2.5732, + "step": 9450500 + }, + { + "epoch": 2.9379809657629314, + "grad_norm": 10.922252655029297, + "learning_rate": 1.033650570617807e-06, + "loss": 2.5034, + "step": 9451000 + }, + { + "epoch": 2.9381363980434183, + "grad_norm": 10.1216459274292, + "learning_rate": 1.0310600326096927e-06, + "loss": 2.5367, + "step": 9451500 + }, + { + "epoch": 2.938291830323905, + "grad_norm": 17.786951065063477, + "learning_rate": 1.028469494601578e-06, + "loss": 2.4896, + "step": 9452000 + }, + { + "epoch": 2.938447262604392, + "grad_norm": 118.4488754272461, + "learning_rate": 1.0258789565934634e-06, + "loss": 2.5203, + "step": 9452500 + }, + { + "epoch": 2.938602694884879, + "grad_norm": 9.421546936035156, + "learning_rate": 1.0232884185853486e-06, + "loss": 2.5153, + "step": 9453000 + }, + { + "epoch": 2.938758127165366, + "grad_norm": 8.939818382263184, + "learning_rate": 1.020697880577234e-06, + "loss": 2.5763, + "step": 9453500 + }, + { + "epoch": 2.9389135594458526, + "grad_norm": 9.993494987487793, + "learning_rate": 1.0181073425691195e-06, + "loss": 2.5643, + "step": 9454000 + }, + { + "epoch": 2.93906899172634, + "grad_norm": 22.599197387695312, + "learning_rate": 1.0155168045610048e-06, + "loss": 2.549, + "step": 9454500 + }, + { + "epoch": 2.9392244240068264, + "grad_norm": 10.059784889221191, + "learning_rate": 1.0129262665528902e-06, + "loss": 2.5403, + "step": 9455000 + }, + { + "epoch": 2.9393798562873137, + "grad_norm": 12.007253646850586, + "learning_rate": 1.0103357285447757e-06, + "loss": 2.4864, + "step": 9455500 + }, + { + "epoch": 2.9395352885678, + "grad_norm": 8.264985084533691, + "learning_rate": 1.0077451905366612e-06, + "loss": 2.5535, + "step": 9456000 + }, + { + "epoch": 2.9396907208482874, + "grad_norm": 56.84333801269531, + "learning_rate": 1.0051546525285464e-06, + "loss": 2.5071, + "step": 9456500 + }, + { + "epoch": 2.939846153128774, + "grad_norm": 10.841495513916016, + "learning_rate": 1.0025641145204319e-06, + "loss": 2.5665, + "step": 9457000 + }, + { + "epoch": 2.940001585409261, + "grad_norm": 10.58409309387207, + "learning_rate": 9.999735765123173e-07, + "loss": 2.574, + "step": 9457500 + }, + { + "epoch": 2.9401570176897476, + "grad_norm": 9.92061996459961, + "learning_rate": 9.973830385042026e-07, + "loss": 2.5315, + "step": 9458000 + }, + { + "epoch": 2.940312449970235, + "grad_norm": 14.53640079498291, + "learning_rate": 9.94792500496088e-07, + "loss": 2.5931, + "step": 9458500 + }, + { + "epoch": 2.9404678822507218, + "grad_norm": 10.36019229888916, + "learning_rate": 9.922019624879735e-07, + "loss": 2.5678, + "step": 9459000 + }, + { + "epoch": 2.9406233145312086, + "grad_norm": 34.519344329833984, + "learning_rate": 9.89611424479859e-07, + "loss": 2.5436, + "step": 9459500 + }, + { + "epoch": 2.9407787468116955, + "grad_norm": 11.211426734924316, + "learning_rate": 9.870208864717442e-07, + "loss": 2.5737, + "step": 9460000 + }, + { + "epoch": 2.9409341790921824, + "grad_norm": 11.174949645996094, + "learning_rate": 9.844303484636297e-07, + "loss": 2.5319, + "step": 9460500 + }, + { + "epoch": 2.941089611372669, + "grad_norm": 10.140610694885254, + "learning_rate": 9.818398104555151e-07, + "loss": 2.5304, + "step": 9461000 + }, + { + "epoch": 2.941245043653156, + "grad_norm": 10.161417007446289, + "learning_rate": 9.792492724474006e-07, + "loss": 2.5338, + "step": 9461500 + }, + { + "epoch": 2.941400475933643, + "grad_norm": 11.125310897827148, + "learning_rate": 9.766587344392858e-07, + "loss": 2.6041, + "step": 9462000 + }, + { + "epoch": 2.94155590821413, + "grad_norm": 29.490097045898438, + "learning_rate": 9.740681964311713e-07, + "loss": 2.592, + "step": 9462500 + }, + { + "epoch": 2.9417113404946167, + "grad_norm": 9.80103588104248, + "learning_rate": 9.714776584230567e-07, + "loss": 2.5787, + "step": 9463000 + }, + { + "epoch": 2.9418667727751036, + "grad_norm": 9.73939323425293, + "learning_rate": 9.68887120414942e-07, + "loss": 2.5235, + "step": 9463500 + }, + { + "epoch": 2.9420222050555904, + "grad_norm": 34.92441940307617, + "learning_rate": 9.662965824068274e-07, + "loss": 2.5573, + "step": 9464000 + }, + { + "epoch": 2.9421776373360773, + "grad_norm": 8.827502250671387, + "learning_rate": 9.63706044398713e-07, + "loss": 2.5695, + "step": 9464500 + }, + { + "epoch": 2.942333069616564, + "grad_norm": 6.695847034454346, + "learning_rate": 9.611155063905984e-07, + "loss": 2.5583, + "step": 9465000 + }, + { + "epoch": 2.942488501897051, + "grad_norm": 9.615049362182617, + "learning_rate": 9.585249683824836e-07, + "loss": 2.514, + "step": 9465500 + }, + { + "epoch": 2.942643934177538, + "grad_norm": 12.841774940490723, + "learning_rate": 9.55934430374369e-07, + "loss": 2.5383, + "step": 9466000 + }, + { + "epoch": 2.9427993664580248, + "grad_norm": 13.415742874145508, + "learning_rate": 9.533438923662544e-07, + "loss": 2.5479, + "step": 9466500 + }, + { + "epoch": 2.9429547987385116, + "grad_norm": 31.759563446044922, + "learning_rate": 9.507533543581398e-07, + "loss": 2.5482, + "step": 9467000 + }, + { + "epoch": 2.9431102310189985, + "grad_norm": 15.976899147033691, + "learning_rate": 9.481628163500252e-07, + "loss": 2.517, + "step": 9467500 + }, + { + "epoch": 2.9432656632994854, + "grad_norm": 9.367329597473145, + "learning_rate": 9.455722783419106e-07, + "loss": 2.6215, + "step": 9468000 + }, + { + "epoch": 2.9434210955799722, + "grad_norm": 11.88831901550293, + "learning_rate": 9.42981740333796e-07, + "loss": 2.5948, + "step": 9468500 + }, + { + "epoch": 2.943576527860459, + "grad_norm": 14.989105224609375, + "learning_rate": 9.403912023256814e-07, + "loss": 2.5253, + "step": 9469000 + }, + { + "epoch": 2.943731960140946, + "grad_norm": 10.833976745605469, + "learning_rate": 9.378006643175669e-07, + "loss": 2.5478, + "step": 9469500 + }, + { + "epoch": 2.943887392421433, + "grad_norm": 14.078373908996582, + "learning_rate": 9.352101263094522e-07, + "loss": 2.5045, + "step": 9470000 + }, + { + "epoch": 2.9440428247019197, + "grad_norm": 10.897651672363281, + "learning_rate": 9.326195883013377e-07, + "loss": 2.5228, + "step": 9470500 + }, + { + "epoch": 2.9441982569824066, + "grad_norm": 8.634085655212402, + "learning_rate": 9.30029050293223e-07, + "loss": 2.5162, + "step": 9471000 + }, + { + "epoch": 2.9443536892628934, + "grad_norm": 8.809225082397461, + "learning_rate": 9.274385122851085e-07, + "loss": 2.543, + "step": 9471500 + }, + { + "epoch": 2.9445091215433803, + "grad_norm": 11.214858055114746, + "learning_rate": 9.248479742769938e-07, + "loss": 2.533, + "step": 9472000 + }, + { + "epoch": 2.944664553823867, + "grad_norm": 11.180342674255371, + "learning_rate": 9.222574362688792e-07, + "loss": 2.6195, + "step": 9472500 + }, + { + "epoch": 2.944819986104354, + "grad_norm": 9.128416061401367, + "learning_rate": 9.196668982607647e-07, + "loss": 2.5444, + "step": 9473000 + }, + { + "epoch": 2.944975418384841, + "grad_norm": 9.582271575927734, + "learning_rate": 9.1707636025265e-07, + "loss": 2.5346, + "step": 9473500 + }, + { + "epoch": 2.9451308506653278, + "grad_norm": 11.855810165405273, + "learning_rate": 9.144858222445355e-07, + "loss": 2.5671, + "step": 9474000 + }, + { + "epoch": 2.9452862829458146, + "grad_norm": 12.932989120483398, + "learning_rate": 9.118952842364208e-07, + "loss": 2.5955, + "step": 9474500 + }, + { + "epoch": 2.9454417152263015, + "grad_norm": 9.952001571655273, + "learning_rate": 9.093047462283063e-07, + "loss": 2.5458, + "step": 9475000 + }, + { + "epoch": 2.9455971475067884, + "grad_norm": 15.33908748626709, + "learning_rate": 9.067142082201916e-07, + "loss": 2.5073, + "step": 9475500 + }, + { + "epoch": 2.9457525797872752, + "grad_norm": 9.41450309753418, + "learning_rate": 9.041236702120771e-07, + "loss": 2.5396, + "step": 9476000 + }, + { + "epoch": 2.945908012067762, + "grad_norm": 10.295154571533203, + "learning_rate": 9.015331322039624e-07, + "loss": 2.5471, + "step": 9476500 + }, + { + "epoch": 2.9460634443482494, + "grad_norm": 12.850446701049805, + "learning_rate": 8.989425941958478e-07, + "loss": 2.5185, + "step": 9477000 + }, + { + "epoch": 2.946218876628736, + "grad_norm": 13.914225578308105, + "learning_rate": 8.963520561877333e-07, + "loss": 2.5122, + "step": 9477500 + }, + { + "epoch": 2.946374308909223, + "grad_norm": 12.510498046875, + "learning_rate": 8.937615181796186e-07, + "loss": 2.522, + "step": 9478000 + }, + { + "epoch": 2.9465297411897096, + "grad_norm": 11.508369445800781, + "learning_rate": 8.911709801715041e-07, + "loss": 2.5373, + "step": 9478500 + }, + { + "epoch": 2.946685173470197, + "grad_norm": 11.877148628234863, + "learning_rate": 8.885804421633894e-07, + "loss": 2.5258, + "step": 9479000 + }, + { + "epoch": 2.9468406057506833, + "grad_norm": 23.04665756225586, + "learning_rate": 8.859899041552749e-07, + "loss": 2.5846, + "step": 9479500 + }, + { + "epoch": 2.9469960380311706, + "grad_norm": 11.920206069946289, + "learning_rate": 8.833993661471602e-07, + "loss": 2.5255, + "step": 9480000 + }, + { + "epoch": 2.947151470311657, + "grad_norm": 10.884651184082031, + "learning_rate": 8.808088281390457e-07, + "loss": 2.5605, + "step": 9480500 + }, + { + "epoch": 2.9473069025921443, + "grad_norm": 10.772955894470215, + "learning_rate": 8.78218290130931e-07, + "loss": 2.5828, + "step": 9481000 + }, + { + "epoch": 2.9474623348726308, + "grad_norm": 10.780261039733887, + "learning_rate": 8.756277521228163e-07, + "loss": 2.5421, + "step": 9481500 + }, + { + "epoch": 2.947617767153118, + "grad_norm": 10.345287322998047, + "learning_rate": 8.730372141147019e-07, + "loss": 2.5144, + "step": 9482000 + }, + { + "epoch": 2.9477731994336045, + "grad_norm": 8.7570219039917, + "learning_rate": 8.704466761065871e-07, + "loss": 2.5633, + "step": 9482500 + }, + { + "epoch": 2.947928631714092, + "grad_norm": 8.555648803710938, + "learning_rate": 8.678561380984727e-07, + "loss": 2.5333, + "step": 9483000 + }, + { + "epoch": 2.9480840639945787, + "grad_norm": 9.498480796813965, + "learning_rate": 8.652656000903579e-07, + "loss": 2.5175, + "step": 9483500 + }, + { + "epoch": 2.9482394962750655, + "grad_norm": 21.012969970703125, + "learning_rate": 8.626750620822435e-07, + "loss": 2.5307, + "step": 9484000 + }, + { + "epoch": 2.9483949285555524, + "grad_norm": 11.396516799926758, + "learning_rate": 8.600845240741287e-07, + "loss": 2.5546, + "step": 9484500 + }, + { + "epoch": 2.9485503608360393, + "grad_norm": 13.911910057067871, + "learning_rate": 8.574939860660143e-07, + "loss": 2.5317, + "step": 9485000 + }, + { + "epoch": 2.948705793116526, + "grad_norm": 13.90290355682373, + "learning_rate": 8.549034480578995e-07, + "loss": 2.5444, + "step": 9485500 + }, + { + "epoch": 2.948861225397013, + "grad_norm": 12.873668670654297, + "learning_rate": 8.523129100497851e-07, + "loss": 2.5476, + "step": 9486000 + }, + { + "epoch": 2.9490166576775, + "grad_norm": 11.022047996520996, + "learning_rate": 8.497223720416704e-07, + "loss": 2.5008, + "step": 9486500 + }, + { + "epoch": 2.9491720899579867, + "grad_norm": 10.42230224609375, + "learning_rate": 8.471318340335557e-07, + "loss": 2.5254, + "step": 9487000 + }, + { + "epoch": 2.9493275222384736, + "grad_norm": 11.190484046936035, + "learning_rate": 8.445412960254412e-07, + "loss": 2.5657, + "step": 9487500 + }, + { + "epoch": 2.9494829545189605, + "grad_norm": 24.14895248413086, + "learning_rate": 8.419507580173265e-07, + "loss": 2.6286, + "step": 9488000 + }, + { + "epoch": 2.9496383867994473, + "grad_norm": 9.814739227294922, + "learning_rate": 8.393602200092121e-07, + "loss": 2.5104, + "step": 9488500 + }, + { + "epoch": 2.949793819079934, + "grad_norm": 10.82650375366211, + "learning_rate": 8.367696820010973e-07, + "loss": 2.5735, + "step": 9489000 + }, + { + "epoch": 2.949949251360421, + "grad_norm": 60.5410270690918, + "learning_rate": 8.341791439929829e-07, + "loss": 2.5529, + "step": 9489500 + }, + { + "epoch": 2.950104683640908, + "grad_norm": 11.102599143981934, + "learning_rate": 8.315886059848681e-07, + "loss": 2.5077, + "step": 9490000 + }, + { + "epoch": 2.950260115921395, + "grad_norm": 9.013894081115723, + "learning_rate": 8.289980679767537e-07, + "loss": 2.5672, + "step": 9490500 + }, + { + "epoch": 2.9504155482018817, + "grad_norm": 9.823522567749023, + "learning_rate": 8.26407529968639e-07, + "loss": 2.5284, + "step": 9491000 + }, + { + "epoch": 2.9505709804823685, + "grad_norm": 11.089028358459473, + "learning_rate": 8.238169919605243e-07, + "loss": 2.5607, + "step": 9491500 + }, + { + "epoch": 2.9507264127628554, + "grad_norm": 32.91123962402344, + "learning_rate": 8.212264539524098e-07, + "loss": 2.541, + "step": 9492000 + }, + { + "epoch": 2.9508818450433423, + "grad_norm": 8.579601287841797, + "learning_rate": 8.186359159442951e-07, + "loss": 2.5585, + "step": 9492500 + }, + { + "epoch": 2.951037277323829, + "grad_norm": 8.965219497680664, + "learning_rate": 8.160453779361806e-07, + "loss": 2.5403, + "step": 9493000 + }, + { + "epoch": 2.951192709604316, + "grad_norm": 12.048830032348633, + "learning_rate": 8.134548399280659e-07, + "loss": 2.5864, + "step": 9493500 + }, + { + "epoch": 2.951348141884803, + "grad_norm": 16.00815773010254, + "learning_rate": 8.108643019199514e-07, + "loss": 2.5472, + "step": 9494000 + }, + { + "epoch": 2.9515035741652897, + "grad_norm": 9.846978187561035, + "learning_rate": 8.082737639118368e-07, + "loss": 2.59, + "step": 9494500 + }, + { + "epoch": 2.9516590064457766, + "grad_norm": 11.23085880279541, + "learning_rate": 8.056832259037222e-07, + "loss": 2.5508, + "step": 9495000 + }, + { + "epoch": 2.9518144387262635, + "grad_norm": 10.39120864868164, + "learning_rate": 8.030926878956076e-07, + "loss": 2.531, + "step": 9495500 + }, + { + "epoch": 2.9519698710067503, + "grad_norm": 8.406571388244629, + "learning_rate": 8.005021498874929e-07, + "loss": 2.5752, + "step": 9496000 + }, + { + "epoch": 2.952125303287237, + "grad_norm": 9.813304901123047, + "learning_rate": 7.979116118793784e-07, + "loss": 2.5488, + "step": 9496500 + }, + { + "epoch": 2.952280735567724, + "grad_norm": 9.072562217712402, + "learning_rate": 7.953210738712637e-07, + "loss": 2.5174, + "step": 9497000 + }, + { + "epoch": 2.952436167848211, + "grad_norm": 13.340097427368164, + "learning_rate": 7.927305358631492e-07, + "loss": 2.4877, + "step": 9497500 + }, + { + "epoch": 2.952591600128698, + "grad_norm": 8.461666107177734, + "learning_rate": 7.901399978550345e-07, + "loss": 2.5953, + "step": 9498000 + }, + { + "epoch": 2.9527470324091847, + "grad_norm": 11.377041816711426, + "learning_rate": 7.8754945984692e-07, + "loss": 2.5901, + "step": 9498500 + }, + { + "epoch": 2.9529024646896715, + "grad_norm": 14.746685028076172, + "learning_rate": 7.849589218388054e-07, + "loss": 2.5814, + "step": 9499000 + }, + { + "epoch": 2.9530578969701584, + "grad_norm": 10.17920207977295, + "learning_rate": 7.823683838306908e-07, + "loss": 2.5705, + "step": 9499500 + }, + { + "epoch": 2.9532133292506453, + "grad_norm": 10.992523193359375, + "learning_rate": 7.797778458225762e-07, + "loss": 2.5314, + "step": 9500000 + }, + { + "epoch": 2.953368761531132, + "grad_norm": 8.803028106689453, + "learning_rate": 7.771873078144615e-07, + "loss": 2.5537, + "step": 9500500 + }, + { + "epoch": 2.953524193811619, + "grad_norm": 10.411258697509766, + "learning_rate": 7.74596769806347e-07, + "loss": 2.595, + "step": 9501000 + }, + { + "epoch": 2.9536796260921063, + "grad_norm": 11.648183822631836, + "learning_rate": 7.720062317982323e-07, + "loss": 2.5549, + "step": 9501500 + }, + { + "epoch": 2.9538350583725927, + "grad_norm": 38.85723876953125, + "learning_rate": 7.694156937901178e-07, + "loss": 2.5417, + "step": 9502000 + }, + { + "epoch": 2.95399049065308, + "grad_norm": 11.172874450683594, + "learning_rate": 7.668251557820031e-07, + "loss": 2.5039, + "step": 9502500 + }, + { + "epoch": 2.9541459229335665, + "grad_norm": 9.19446849822998, + "learning_rate": 7.642346177738885e-07, + "loss": 2.5168, + "step": 9503000 + }, + { + "epoch": 2.954301355214054, + "grad_norm": 11.107792854309082, + "learning_rate": 7.61644079765774e-07, + "loss": 2.5776, + "step": 9503500 + }, + { + "epoch": 2.95445678749454, + "grad_norm": 12.812943458557129, + "learning_rate": 7.590535417576593e-07, + "loss": 2.536, + "step": 9504000 + }, + { + "epoch": 2.9546122197750275, + "grad_norm": 8.57280445098877, + "learning_rate": 7.564630037495448e-07, + "loss": 2.5304, + "step": 9504500 + }, + { + "epoch": 2.954767652055514, + "grad_norm": 14.11327838897705, + "learning_rate": 7.538724657414301e-07, + "loss": 2.5725, + "step": 9505000 + }, + { + "epoch": 2.9549230843360013, + "grad_norm": 10.310990333557129, + "learning_rate": 7.512819277333156e-07, + "loss": 2.5418, + "step": 9505500 + }, + { + "epoch": 2.9550785166164877, + "grad_norm": 30.63482666015625, + "learning_rate": 7.486913897252009e-07, + "loss": 2.5561, + "step": 9506000 + }, + { + "epoch": 2.955233948896975, + "grad_norm": 10.807637214660645, + "learning_rate": 7.461008517170864e-07, + "loss": 2.5469, + "step": 9506500 + }, + { + "epoch": 2.955389381177462, + "grad_norm": 10.576224327087402, + "learning_rate": 7.435103137089718e-07, + "loss": 2.5736, + "step": 9507000 + }, + { + "epoch": 2.9555448134579487, + "grad_norm": 8.926407814025879, + "learning_rate": 7.409197757008572e-07, + "loss": 2.5482, + "step": 9507500 + }, + { + "epoch": 2.9557002457384356, + "grad_norm": 9.539170265197754, + "learning_rate": 7.383292376927425e-07, + "loss": 2.5415, + "step": 9508000 + }, + { + "epoch": 2.9558556780189225, + "grad_norm": 9.293415069580078, + "learning_rate": 7.357386996846279e-07, + "loss": 2.5403, + "step": 9508500 + }, + { + "epoch": 2.9560111102994093, + "grad_norm": 10.200508117675781, + "learning_rate": 7.331481616765133e-07, + "loss": 2.5524, + "step": 9509000 + }, + { + "epoch": 2.956166542579896, + "grad_norm": 9.446690559387207, + "learning_rate": 7.305576236683987e-07, + "loss": 2.5671, + "step": 9509500 + }, + { + "epoch": 2.956321974860383, + "grad_norm": 9.765421867370605, + "learning_rate": 7.279670856602841e-07, + "loss": 2.575, + "step": 9510000 + }, + { + "epoch": 2.95647740714087, + "grad_norm": 9.03207778930664, + "learning_rate": 7.253765476521695e-07, + "loss": 2.5705, + "step": 9510500 + }, + { + "epoch": 2.956632839421357, + "grad_norm": 11.344080924987793, + "learning_rate": 7.227860096440549e-07, + "loss": 2.538, + "step": 9511000 + }, + { + "epoch": 2.9567882717018437, + "grad_norm": 13.84161376953125, + "learning_rate": 7.201954716359404e-07, + "loss": 2.5246, + "step": 9511500 + }, + { + "epoch": 2.9569437039823305, + "grad_norm": 9.217541694641113, + "learning_rate": 7.176049336278257e-07, + "loss": 2.4975, + "step": 9512000 + }, + { + "epoch": 2.9570991362628174, + "grad_norm": 15.22597885131836, + "learning_rate": 7.150143956197111e-07, + "loss": 2.5103, + "step": 9512500 + }, + { + "epoch": 2.9572545685433043, + "grad_norm": 9.787153244018555, + "learning_rate": 7.124238576115965e-07, + "loss": 2.5168, + "step": 9513000 + }, + { + "epoch": 2.957410000823791, + "grad_norm": 16.10236930847168, + "learning_rate": 7.098333196034819e-07, + "loss": 2.5167, + "step": 9513500 + }, + { + "epoch": 2.957565433104278, + "grad_norm": 30.040252685546875, + "learning_rate": 7.072427815953673e-07, + "loss": 2.536, + "step": 9514000 + }, + { + "epoch": 2.957720865384765, + "grad_norm": 12.308187484741211, + "learning_rate": 7.046522435872527e-07, + "loss": 2.5693, + "step": 9514500 + }, + { + "epoch": 2.9578762976652517, + "grad_norm": 9.552066802978516, + "learning_rate": 7.020617055791382e-07, + "loss": 2.5388, + "step": 9515000 + }, + { + "epoch": 2.9580317299457386, + "grad_norm": 11.670571327209473, + "learning_rate": 6.994711675710235e-07, + "loss": 2.5887, + "step": 9515500 + }, + { + "epoch": 2.9581871622262255, + "grad_norm": 11.23208236694336, + "learning_rate": 6.96880629562909e-07, + "loss": 2.5388, + "step": 9516000 + }, + { + "epoch": 2.9583425945067123, + "grad_norm": 10.55915641784668, + "learning_rate": 6.942900915547943e-07, + "loss": 2.5688, + "step": 9516500 + }, + { + "epoch": 2.958498026787199, + "grad_norm": 8.675676345825195, + "learning_rate": 6.916995535466798e-07, + "loss": 2.5477, + "step": 9517000 + }, + { + "epoch": 2.958653459067686, + "grad_norm": 10.338932991027832, + "learning_rate": 6.89109015538565e-07, + "loss": 2.5261, + "step": 9517500 + }, + { + "epoch": 2.958808891348173, + "grad_norm": 8.074963569641113, + "learning_rate": 6.865184775304505e-07, + "loss": 2.5056, + "step": 9518000 + }, + { + "epoch": 2.95896432362866, + "grad_norm": 12.86306381225586, + "learning_rate": 6.839279395223359e-07, + "loss": 2.552, + "step": 9518500 + }, + { + "epoch": 2.9591197559091467, + "grad_norm": 9.981929779052734, + "learning_rate": 6.813374015142213e-07, + "loss": 2.4955, + "step": 9519000 + }, + { + "epoch": 2.9592751881896335, + "grad_norm": 12.456491470336914, + "learning_rate": 6.787468635061068e-07, + "loss": 2.5185, + "step": 9519500 + }, + { + "epoch": 2.9594306204701204, + "grad_norm": 13.703752517700195, + "learning_rate": 6.761563254979921e-07, + "loss": 2.5537, + "step": 9520000 + }, + { + "epoch": 2.9595860527506073, + "grad_norm": 10.826987266540527, + "learning_rate": 6.735657874898776e-07, + "loss": 2.5272, + "step": 9520500 + }, + { + "epoch": 2.959741485031094, + "grad_norm": 10.710824012756348, + "learning_rate": 6.709752494817629e-07, + "loss": 2.5977, + "step": 9521000 + }, + { + "epoch": 2.959896917311581, + "grad_norm": 7.647745609283447, + "learning_rate": 6.683847114736484e-07, + "loss": 2.5452, + "step": 9521500 + }, + { + "epoch": 2.960052349592068, + "grad_norm": 10.716241836547852, + "learning_rate": 6.657941734655337e-07, + "loss": 2.5167, + "step": 9522000 + }, + { + "epoch": 2.9602077818725547, + "grad_norm": 12.636094093322754, + "learning_rate": 6.632036354574191e-07, + "loss": 2.5123, + "step": 9522500 + }, + { + "epoch": 2.9603632141530416, + "grad_norm": 12.124274253845215, + "learning_rate": 6.606130974493044e-07, + "loss": 2.5577, + "step": 9523000 + }, + { + "epoch": 2.9605186464335285, + "grad_norm": 10.241275787353516, + "learning_rate": 6.580225594411899e-07, + "loss": 2.5866, + "step": 9523500 + }, + { + "epoch": 2.9606740787140153, + "grad_norm": 11.82462215423584, + "learning_rate": 6.554320214330753e-07, + "loss": 2.5876, + "step": 9524000 + }, + { + "epoch": 2.960829510994502, + "grad_norm": 9.732268333435059, + "learning_rate": 6.528414834249607e-07, + "loss": 2.5355, + "step": 9524500 + }, + { + "epoch": 2.9609849432749895, + "grad_norm": 17.7093448638916, + "learning_rate": 6.502509454168461e-07, + "loss": 2.571, + "step": 9525000 + }, + { + "epoch": 2.961140375555476, + "grad_norm": 20.74256134033203, + "learning_rate": 6.476604074087315e-07, + "loss": 2.4954, + "step": 9525500 + }, + { + "epoch": 2.9612958078359632, + "grad_norm": 8.863038063049316, + "learning_rate": 6.450698694006169e-07, + "loss": 2.5495, + "step": 9526000 + }, + { + "epoch": 2.9614512401164497, + "grad_norm": 10.87711238861084, + "learning_rate": 6.424793313925023e-07, + "loss": 2.5557, + "step": 9526500 + }, + { + "epoch": 2.961606672396937, + "grad_norm": 11.97116756439209, + "learning_rate": 6.398887933843877e-07, + "loss": 2.5633, + "step": 9527000 + }, + { + "epoch": 2.9617621046774234, + "grad_norm": 8.955004692077637, + "learning_rate": 6.37298255376273e-07, + "loss": 2.5321, + "step": 9527500 + }, + { + "epoch": 2.9619175369579107, + "grad_norm": 8.59365177154541, + "learning_rate": 6.347077173681585e-07, + "loss": 2.5289, + "step": 9528000 + }, + { + "epoch": 2.962072969238397, + "grad_norm": 9.699736595153809, + "learning_rate": 6.321171793600439e-07, + "loss": 2.5971, + "step": 9528500 + }, + { + "epoch": 2.9622284015188844, + "grad_norm": 24.809484481811523, + "learning_rate": 6.295266413519293e-07, + "loss": 2.6095, + "step": 9529000 + }, + { + "epoch": 2.962383833799371, + "grad_norm": 8.530774116516113, + "learning_rate": 6.269361033438147e-07, + "loss": 2.579, + "step": 9529500 + }, + { + "epoch": 2.962539266079858, + "grad_norm": 10.015511512756348, + "learning_rate": 6.243455653357001e-07, + "loss": 2.5051, + "step": 9530000 + }, + { + "epoch": 2.9626946983603446, + "grad_norm": 10.639449119567871, + "learning_rate": 6.217550273275855e-07, + "loss": 2.5212, + "step": 9530500 + }, + { + "epoch": 2.962850130640832, + "grad_norm": 10.264286041259766, + "learning_rate": 6.191644893194709e-07, + "loss": 2.5508, + "step": 9531000 + }, + { + "epoch": 2.963005562921319, + "grad_norm": 11.513893127441406, + "learning_rate": 6.165739513113563e-07, + "loss": 2.5063, + "step": 9531500 + }, + { + "epoch": 2.9631609952018056, + "grad_norm": 11.17349910736084, + "learning_rate": 6.139834133032416e-07, + "loss": 2.5993, + "step": 9532000 + }, + { + "epoch": 2.9633164274822925, + "grad_norm": 12.228732109069824, + "learning_rate": 6.11392875295127e-07, + "loss": 2.5378, + "step": 9532500 + }, + { + "epoch": 2.9634718597627794, + "grad_norm": 9.877158164978027, + "learning_rate": 6.088023372870125e-07, + "loss": 2.527, + "step": 9533000 + }, + { + "epoch": 2.9636272920432662, + "grad_norm": 16.54619598388672, + "learning_rate": 6.062117992788978e-07, + "loss": 2.5483, + "step": 9533500 + }, + { + "epoch": 2.963782724323753, + "grad_norm": 10.998270034790039, + "learning_rate": 6.036212612707833e-07, + "loss": 2.5728, + "step": 9534000 + }, + { + "epoch": 2.96393815660424, + "grad_norm": 23.020288467407227, + "learning_rate": 6.010307232626686e-07, + "loss": 2.5209, + "step": 9534500 + }, + { + "epoch": 2.964093588884727, + "grad_norm": 9.304927825927734, + "learning_rate": 5.984401852545541e-07, + "loss": 2.52, + "step": 9535000 + }, + { + "epoch": 2.9642490211652137, + "grad_norm": 9.253397941589355, + "learning_rate": 5.958496472464394e-07, + "loss": 2.5421, + "step": 9535500 + }, + { + "epoch": 2.9644044534457006, + "grad_norm": 8.84877872467041, + "learning_rate": 5.932591092383249e-07, + "loss": 2.5344, + "step": 9536000 + }, + { + "epoch": 2.9645598857261874, + "grad_norm": 9.57837963104248, + "learning_rate": 5.906685712302103e-07, + "loss": 2.5624, + "step": 9536500 + }, + { + "epoch": 2.9647153180066743, + "grad_norm": 10.770204544067383, + "learning_rate": 5.880780332220956e-07, + "loss": 2.5542, + "step": 9537000 + }, + { + "epoch": 2.964870750287161, + "grad_norm": 13.607057571411133, + "learning_rate": 5.854874952139811e-07, + "loss": 2.5639, + "step": 9537500 + }, + { + "epoch": 2.965026182567648, + "grad_norm": 8.512081146240234, + "learning_rate": 5.828969572058664e-07, + "loss": 2.5415, + "step": 9538000 + }, + { + "epoch": 2.965181614848135, + "grad_norm": 9.292428970336914, + "learning_rate": 5.803064191977519e-07, + "loss": 2.5791, + "step": 9538500 + }, + { + "epoch": 2.965337047128622, + "grad_norm": 12.637025833129883, + "learning_rate": 5.777158811896372e-07, + "loss": 2.5677, + "step": 9539000 + }, + { + "epoch": 2.9654924794091087, + "grad_norm": 26.830894470214844, + "learning_rate": 5.751253431815227e-07, + "loss": 2.5588, + "step": 9539500 + }, + { + "epoch": 2.9656479116895955, + "grad_norm": 8.787605285644531, + "learning_rate": 5.72534805173408e-07, + "loss": 2.525, + "step": 9540000 + }, + { + "epoch": 2.9658033439700824, + "grad_norm": 10.217330932617188, + "learning_rate": 5.699442671652935e-07, + "loss": 2.5533, + "step": 9540500 + }, + { + "epoch": 2.9659587762505693, + "grad_norm": 9.738751411437988, + "learning_rate": 5.673537291571789e-07, + "loss": 2.5541, + "step": 9541000 + }, + { + "epoch": 2.966114208531056, + "grad_norm": 13.704972267150879, + "learning_rate": 5.647631911490642e-07, + "loss": 2.5416, + "step": 9541500 + }, + { + "epoch": 2.966269640811543, + "grad_norm": 7.760972499847412, + "learning_rate": 5.621726531409497e-07, + "loss": 2.5499, + "step": 9542000 + }, + { + "epoch": 2.96642507309203, + "grad_norm": 10.728918075561523, + "learning_rate": 5.59582115132835e-07, + "loss": 2.5228, + "step": 9542500 + }, + { + "epoch": 2.9665805053725167, + "grad_norm": 10.035588264465332, + "learning_rate": 5.569915771247205e-07, + "loss": 2.4902, + "step": 9543000 + }, + { + "epoch": 2.9667359376530036, + "grad_norm": 9.775123596191406, + "learning_rate": 5.544010391166058e-07, + "loss": 2.5472, + "step": 9543500 + }, + { + "epoch": 2.9668913699334905, + "grad_norm": 40.31206512451172, + "learning_rate": 5.518105011084913e-07, + "loss": 2.5701, + "step": 9544000 + }, + { + "epoch": 2.9670468022139773, + "grad_norm": 15.013649940490723, + "learning_rate": 5.492199631003766e-07, + "loss": 2.5858, + "step": 9544500 + }, + { + "epoch": 2.967202234494464, + "grad_norm": 36.92235565185547, + "learning_rate": 5.466294250922621e-07, + "loss": 2.5826, + "step": 9545000 + }, + { + "epoch": 2.967357666774951, + "grad_norm": 8.953486442565918, + "learning_rate": 5.440388870841475e-07, + "loss": 2.5704, + "step": 9545500 + }, + { + "epoch": 2.967513099055438, + "grad_norm": 14.314534187316895, + "learning_rate": 5.414483490760328e-07, + "loss": 2.5454, + "step": 9546000 + }, + { + "epoch": 2.967668531335925, + "grad_norm": 12.233436584472656, + "learning_rate": 5.388578110679182e-07, + "loss": 2.5585, + "step": 9546500 + }, + { + "epoch": 2.9678239636164117, + "grad_norm": 11.59201431274414, + "learning_rate": 5.362672730598036e-07, + "loss": 2.5382, + "step": 9547000 + }, + { + "epoch": 2.9679793958968985, + "grad_norm": 9.162068367004395, + "learning_rate": 5.33676735051689e-07, + "loss": 2.5219, + "step": 9547500 + }, + { + "epoch": 2.9681348281773854, + "grad_norm": 11.924562454223633, + "learning_rate": 5.310861970435744e-07, + "loss": 2.5624, + "step": 9548000 + }, + { + "epoch": 2.9682902604578723, + "grad_norm": 15.723865509033203, + "learning_rate": 5.284956590354598e-07, + "loss": 2.5674, + "step": 9548500 + }, + { + "epoch": 2.968445692738359, + "grad_norm": 15.6532621383667, + "learning_rate": 5.259051210273453e-07, + "loss": 2.5629, + "step": 9549000 + }, + { + "epoch": 2.9686011250188464, + "grad_norm": 16.04250144958496, + "learning_rate": 5.233145830192306e-07, + "loss": 2.566, + "step": 9549500 + }, + { + "epoch": 2.968756557299333, + "grad_norm": 11.281808853149414, + "learning_rate": 5.207240450111161e-07, + "loss": 2.5422, + "step": 9550000 + }, + { + "epoch": 2.96891198957982, + "grad_norm": 9.348243713378906, + "learning_rate": 5.181335070030014e-07, + "loss": 2.5389, + "step": 9550500 + }, + { + "epoch": 2.9690674218603066, + "grad_norm": 7.031998634338379, + "learning_rate": 5.155429689948868e-07, + "loss": 2.5681, + "step": 9551000 + }, + { + "epoch": 2.969222854140794, + "grad_norm": 10.89378833770752, + "learning_rate": 5.129524309867722e-07, + "loss": 2.5265, + "step": 9551500 + }, + { + "epoch": 2.9693782864212803, + "grad_norm": 27.480785369873047, + "learning_rate": 5.103618929786576e-07, + "loss": 2.5102, + "step": 9552000 + }, + { + "epoch": 2.9695337187017676, + "grad_norm": 16.1590633392334, + "learning_rate": 5.07771354970543e-07, + "loss": 2.5698, + "step": 9552500 + }, + { + "epoch": 2.969689150982254, + "grad_norm": 17.6351375579834, + "learning_rate": 5.051808169624284e-07, + "loss": 2.5885, + "step": 9553000 + }, + { + "epoch": 2.9698445832627414, + "grad_norm": 11.819221496582031, + "learning_rate": 5.025902789543139e-07, + "loss": 2.6204, + "step": 9553500 + }, + { + "epoch": 2.970000015543228, + "grad_norm": 10.999786376953125, + "learning_rate": 4.999997409461992e-07, + "loss": 2.5461, + "step": 9554000 + }, + { + "epoch": 2.970155447823715, + "grad_norm": 9.532320022583008, + "learning_rate": 4.974092029380847e-07, + "loss": 2.5465, + "step": 9554500 + }, + { + "epoch": 2.970310880104202, + "grad_norm": 10.661921501159668, + "learning_rate": 4.9481866492997e-07, + "loss": 2.5401, + "step": 9555000 + }, + { + "epoch": 2.970466312384689, + "grad_norm": 8.967796325683594, + "learning_rate": 4.922281269218555e-07, + "loss": 2.5518, + "step": 9555500 + }, + { + "epoch": 2.9706217446651757, + "grad_norm": 10.778791427612305, + "learning_rate": 4.896375889137407e-07, + "loss": 2.5762, + "step": 9556000 + }, + { + "epoch": 2.9707771769456626, + "grad_norm": 10.127313613891602, + "learning_rate": 4.870470509056262e-07, + "loss": 2.5244, + "step": 9556500 + }, + { + "epoch": 2.9709326092261494, + "grad_norm": 12.76952838897705, + "learning_rate": 4.844565128975115e-07, + "loss": 2.5633, + "step": 9557000 + }, + { + "epoch": 2.9710880415066363, + "grad_norm": 11.746516227722168, + "learning_rate": 4.81865974889397e-07, + "loss": 2.5367, + "step": 9557500 + }, + { + "epoch": 2.971243473787123, + "grad_norm": 8.153824806213379, + "learning_rate": 4.792754368812824e-07, + "loss": 2.5586, + "step": 9558000 + }, + { + "epoch": 2.97139890606761, + "grad_norm": 10.54198932647705, + "learning_rate": 4.766848988731678e-07, + "loss": 2.5618, + "step": 9558500 + }, + { + "epoch": 2.971554338348097, + "grad_norm": 10.075435638427734, + "learning_rate": 4.740943608650532e-07, + "loss": 2.5202, + "step": 9559000 + }, + { + "epoch": 2.9717097706285838, + "grad_norm": 13.582722663879395, + "learning_rate": 4.7150382285693863e-07, + "loss": 2.5192, + "step": 9559500 + }, + { + "epoch": 2.9718652029090706, + "grad_norm": 9.140120506286621, + "learning_rate": 4.6891328484882403e-07, + "loss": 2.5542, + "step": 9560000 + }, + { + "epoch": 2.9720206351895575, + "grad_norm": 10.03538990020752, + "learning_rate": 4.6632274684070933e-07, + "loss": 2.5599, + "step": 9560500 + }, + { + "epoch": 2.9721760674700444, + "grad_norm": 9.613263130187988, + "learning_rate": 4.6373220883259474e-07, + "loss": 2.5403, + "step": 9561000 + }, + { + "epoch": 2.9723314997505312, + "grad_norm": 19.792469024658203, + "learning_rate": 4.6114167082448015e-07, + "loss": 2.5777, + "step": 9561500 + }, + { + "epoch": 2.972486932031018, + "grad_norm": 9.008800506591797, + "learning_rate": 4.5855113281636555e-07, + "loss": 2.5803, + "step": 9562000 + }, + { + "epoch": 2.972642364311505, + "grad_norm": 9.438431739807129, + "learning_rate": 4.5596059480825096e-07, + "loss": 2.5035, + "step": 9562500 + }, + { + "epoch": 2.972797796591992, + "grad_norm": 7.662083148956299, + "learning_rate": 4.5337005680013637e-07, + "loss": 2.5438, + "step": 9563000 + }, + { + "epoch": 2.9729532288724787, + "grad_norm": 12.383275985717773, + "learning_rate": 4.507795187920218e-07, + "loss": 2.5649, + "step": 9563500 + }, + { + "epoch": 2.9731086611529656, + "grad_norm": 10.044651985168457, + "learning_rate": 4.4818898078390723e-07, + "loss": 2.5845, + "step": 9564000 + }, + { + "epoch": 2.9732640934334524, + "grad_norm": 8.517440795898438, + "learning_rate": 4.4559844277579264e-07, + "loss": 2.5448, + "step": 9564500 + }, + { + "epoch": 2.9734195257139393, + "grad_norm": 42.01280975341797, + "learning_rate": 4.4300790476767804e-07, + "loss": 2.5594, + "step": 9565000 + }, + { + "epoch": 2.973574957994426, + "grad_norm": 9.589709281921387, + "learning_rate": 4.4041736675956335e-07, + "loss": 2.5135, + "step": 9565500 + }, + { + "epoch": 2.973730390274913, + "grad_norm": 8.909619331359863, + "learning_rate": 4.3782682875144875e-07, + "loss": 2.5442, + "step": 9566000 + }, + { + "epoch": 2.9738858225554, + "grad_norm": 11.155740737915039, + "learning_rate": 4.3523629074333416e-07, + "loss": 2.5369, + "step": 9566500 + }, + { + "epoch": 2.9740412548358868, + "grad_norm": 12.474522590637207, + "learning_rate": 4.3264575273521956e-07, + "loss": 2.5362, + "step": 9567000 + }, + { + "epoch": 2.9741966871163736, + "grad_norm": 13.246380805969238, + "learning_rate": 4.3005521472710497e-07, + "loss": 2.5664, + "step": 9567500 + }, + { + "epoch": 2.9743521193968605, + "grad_norm": 10.231511116027832, + "learning_rate": 4.274646767189904e-07, + "loss": 2.5875, + "step": 9568000 + }, + { + "epoch": 2.9745075516773474, + "grad_norm": 9.189955711364746, + "learning_rate": 4.248741387108758e-07, + "loss": 2.5149, + "step": 9568500 + }, + { + "epoch": 2.9746629839578342, + "grad_norm": 11.075998306274414, + "learning_rate": 4.222836007027612e-07, + "loss": 2.5128, + "step": 9569000 + }, + { + "epoch": 2.974818416238321, + "grad_norm": 10.591747283935547, + "learning_rate": 4.196930626946466e-07, + "loss": 2.5115, + "step": 9569500 + }, + { + "epoch": 2.974973848518808, + "grad_norm": 9.895319938659668, + "learning_rate": 4.1710252468653195e-07, + "loss": 2.4856, + "step": 9570000 + }, + { + "epoch": 2.975129280799295, + "grad_norm": 21.103742599487305, + "learning_rate": 4.1451198667841736e-07, + "loss": 2.5699, + "step": 9570500 + }, + { + "epoch": 2.9752847130797817, + "grad_norm": 9.916543960571289, + "learning_rate": 4.1192144867030276e-07, + "loss": 2.5649, + "step": 9571000 + }, + { + "epoch": 2.9754401453602686, + "grad_norm": 18.409725189208984, + "learning_rate": 4.0933091066218817e-07, + "loss": 2.5345, + "step": 9571500 + }, + { + "epoch": 2.9755955776407554, + "grad_norm": 12.717531204223633, + "learning_rate": 4.067403726540736e-07, + "loss": 2.5567, + "step": 9572000 + }, + { + "epoch": 2.9757510099212423, + "grad_norm": 9.801141738891602, + "learning_rate": 4.04149834645959e-07, + "loss": 2.5116, + "step": 9572500 + }, + { + "epoch": 2.975906442201729, + "grad_norm": 10.98553466796875, + "learning_rate": 4.015592966378444e-07, + "loss": 2.5223, + "step": 9573000 + }, + { + "epoch": 2.976061874482216, + "grad_norm": 9.235319137573242, + "learning_rate": 3.989687586297298e-07, + "loss": 2.5237, + "step": 9573500 + }, + { + "epoch": 2.9762173067627034, + "grad_norm": 9.913247108459473, + "learning_rate": 3.963782206216152e-07, + "loss": 2.5538, + "step": 9574000 + }, + { + "epoch": 2.9763727390431898, + "grad_norm": 8.44636058807373, + "learning_rate": 3.937876826135006e-07, + "loss": 2.5901, + "step": 9574500 + }, + { + "epoch": 2.976528171323677, + "grad_norm": 10.107255935668945, + "learning_rate": 3.911971446053859e-07, + "loss": 2.5185, + "step": 9575000 + }, + { + "epoch": 2.9766836036041635, + "grad_norm": 12.415229797363281, + "learning_rate": 3.8860660659727137e-07, + "loss": 2.5697, + "step": 9575500 + }, + { + "epoch": 2.976839035884651, + "grad_norm": 8.15087604522705, + "learning_rate": 3.860160685891567e-07, + "loss": 2.5647, + "step": 9576000 + }, + { + "epoch": 2.9769944681651372, + "grad_norm": 10.334688186645508, + "learning_rate": 3.834255305810421e-07, + "loss": 2.5474, + "step": 9576500 + }, + { + "epoch": 2.9771499004456246, + "grad_norm": 27.788124084472656, + "learning_rate": 3.8083499257292753e-07, + "loss": 2.5073, + "step": 9577000 + }, + { + "epoch": 2.977305332726111, + "grad_norm": 10.2301664352417, + "learning_rate": 3.7824445456481294e-07, + "loss": 2.5566, + "step": 9577500 + }, + { + "epoch": 2.9774607650065983, + "grad_norm": 10.14876937866211, + "learning_rate": 3.7565391655669835e-07, + "loss": 2.4928, + "step": 9578000 + }, + { + "epoch": 2.9776161972870847, + "grad_norm": 10.483304023742676, + "learning_rate": 3.7306337854858375e-07, + "loss": 2.5349, + "step": 9578500 + }, + { + "epoch": 2.977771629567572, + "grad_norm": 10.093409538269043, + "learning_rate": 3.7047284054046916e-07, + "loss": 2.5548, + "step": 9579000 + }, + { + "epoch": 2.977927061848059, + "grad_norm": 12.016311645507812, + "learning_rate": 3.6788230253235456e-07, + "loss": 2.5909, + "step": 9579500 + }, + { + "epoch": 2.9780824941285458, + "grad_norm": 10.166821479797363, + "learning_rate": 3.6529176452423997e-07, + "loss": 2.5684, + "step": 9580000 + }, + { + "epoch": 2.9782379264090326, + "grad_norm": 16.018203735351562, + "learning_rate": 3.627012265161254e-07, + "loss": 2.5661, + "step": 9580500 + }, + { + "epoch": 2.9783933586895195, + "grad_norm": 9.010330200195312, + "learning_rate": 3.6011068850801073e-07, + "loss": 2.5162, + "step": 9581000 + }, + { + "epoch": 2.9785487909700064, + "grad_norm": 8.870890617370605, + "learning_rate": 3.5752015049989614e-07, + "loss": 2.5734, + "step": 9581500 + }, + { + "epoch": 2.978704223250493, + "grad_norm": 13.8971586227417, + "learning_rate": 3.5492961249178154e-07, + "loss": 2.5242, + "step": 9582000 + }, + { + "epoch": 2.97885965553098, + "grad_norm": 10.573172569274902, + "learning_rate": 3.5233907448366695e-07, + "loss": 2.5825, + "step": 9582500 + }, + { + "epoch": 2.979015087811467, + "grad_norm": 10.261693000793457, + "learning_rate": 3.4974853647555236e-07, + "loss": 2.5572, + "step": 9583000 + }, + { + "epoch": 2.979170520091954, + "grad_norm": 10.765734672546387, + "learning_rate": 3.471579984674377e-07, + "loss": 2.5772, + "step": 9583500 + }, + { + "epoch": 2.9793259523724407, + "grad_norm": 8.50390338897705, + "learning_rate": 3.445674604593231e-07, + "loss": 2.517, + "step": 9584000 + }, + { + "epoch": 2.9794813846529276, + "grad_norm": 10.791939735412598, + "learning_rate": 3.419769224512085e-07, + "loss": 2.515, + "step": 9584500 + }, + { + "epoch": 2.9796368169334144, + "grad_norm": 13.373839378356934, + "learning_rate": 3.3938638444309393e-07, + "loss": 2.5787, + "step": 9585000 + }, + { + "epoch": 2.9797922492139013, + "grad_norm": 11.336068153381348, + "learning_rate": 3.367958464349793e-07, + "loss": 2.5475, + "step": 9585500 + }, + { + "epoch": 2.979947681494388, + "grad_norm": 9.97149658203125, + "learning_rate": 3.342053084268647e-07, + "loss": 2.583, + "step": 9586000 + }, + { + "epoch": 2.980103113774875, + "grad_norm": 10.35996150970459, + "learning_rate": 3.316147704187501e-07, + "loss": 2.533, + "step": 9586500 + }, + { + "epoch": 2.980258546055362, + "grad_norm": 12.44159984588623, + "learning_rate": 3.290242324106355e-07, + "loss": 2.5389, + "step": 9587000 + }, + { + "epoch": 2.9804139783358488, + "grad_norm": 11.394270896911621, + "learning_rate": 3.2643369440252096e-07, + "loss": 2.5378, + "step": 9587500 + }, + { + "epoch": 2.9805694106163356, + "grad_norm": 9.538310050964355, + "learning_rate": 3.238431563944063e-07, + "loss": 2.5766, + "step": 9588000 + }, + { + "epoch": 2.9807248428968225, + "grad_norm": 8.9402437210083, + "learning_rate": 3.212526183862917e-07, + "loss": 2.5613, + "step": 9588500 + }, + { + "epoch": 2.9808802751773094, + "grad_norm": 17.56945037841797, + "learning_rate": 3.1866208037817713e-07, + "loss": 2.5318, + "step": 9589000 + }, + { + "epoch": 2.9810357074577962, + "grad_norm": 10.223602294921875, + "learning_rate": 3.1607154237006253e-07, + "loss": 2.5252, + "step": 9589500 + }, + { + "epoch": 2.981191139738283, + "grad_norm": 11.623954772949219, + "learning_rate": 3.1348100436194794e-07, + "loss": 2.5648, + "step": 9590000 + }, + { + "epoch": 2.98134657201877, + "grad_norm": 10.642101287841797, + "learning_rate": 3.108904663538333e-07, + "loss": 2.536, + "step": 9590500 + }, + { + "epoch": 2.981502004299257, + "grad_norm": 9.293224334716797, + "learning_rate": 3.082999283457187e-07, + "loss": 2.5347, + "step": 9591000 + }, + { + "epoch": 2.9816574365797437, + "grad_norm": 40.11482238769531, + "learning_rate": 3.057093903376041e-07, + "loss": 2.5637, + "step": 9591500 + }, + { + "epoch": 2.9818128688602306, + "grad_norm": 9.72925090789795, + "learning_rate": 3.031188523294895e-07, + "loss": 2.5299, + "step": 9592000 + }, + { + "epoch": 2.9819683011407174, + "grad_norm": 9.919795989990234, + "learning_rate": 3.005283143213749e-07, + "loss": 2.5213, + "step": 9592500 + }, + { + "epoch": 2.9821237334212043, + "grad_norm": 9.843976020812988, + "learning_rate": 2.9793777631326027e-07, + "loss": 2.5971, + "step": 9593000 + }, + { + "epoch": 2.982279165701691, + "grad_norm": 8.506977081298828, + "learning_rate": 2.953472383051457e-07, + "loss": 2.57, + "step": 9593500 + }, + { + "epoch": 2.982434597982178, + "grad_norm": 9.88122272491455, + "learning_rate": 2.927567002970311e-07, + "loss": 2.5332, + "step": 9594000 + }, + { + "epoch": 2.982590030262665, + "grad_norm": 11.620176315307617, + "learning_rate": 2.901661622889165e-07, + "loss": 2.5172, + "step": 9594500 + }, + { + "epoch": 2.9827454625431518, + "grad_norm": 13.85561466217041, + "learning_rate": 2.875756242808019e-07, + "loss": 2.5341, + "step": 9595000 + }, + { + "epoch": 2.9829008948236386, + "grad_norm": 11.02525806427002, + "learning_rate": 2.849850862726873e-07, + "loss": 2.555, + "step": 9595500 + }, + { + "epoch": 2.9830563271041255, + "grad_norm": 12.813226699829102, + "learning_rate": 2.823945482645727e-07, + "loss": 2.5304, + "step": 9596000 + }, + { + "epoch": 2.9832117593846124, + "grad_norm": 12.864847183227539, + "learning_rate": 2.798040102564581e-07, + "loss": 2.5788, + "step": 9596500 + }, + { + "epoch": 2.9833671916650992, + "grad_norm": 9.46646499633789, + "learning_rate": 2.772134722483435e-07, + "loss": 2.5506, + "step": 9597000 + }, + { + "epoch": 2.9835226239455865, + "grad_norm": 11.155564308166504, + "learning_rate": 2.746229342402289e-07, + "loss": 2.5613, + "step": 9597500 + }, + { + "epoch": 2.983678056226073, + "grad_norm": 8.656083106994629, + "learning_rate": 2.720323962321143e-07, + "loss": 2.5282, + "step": 9598000 + }, + { + "epoch": 2.9838334885065603, + "grad_norm": 11.713391304016113, + "learning_rate": 2.694418582239997e-07, + "loss": 2.5447, + "step": 9598500 + }, + { + "epoch": 2.9839889207870467, + "grad_norm": 10.154376029968262, + "learning_rate": 2.668513202158851e-07, + "loss": 2.5443, + "step": 9599000 + }, + { + "epoch": 2.984144353067534, + "grad_norm": 12.601530075073242, + "learning_rate": 2.642607822077705e-07, + "loss": 2.5543, + "step": 9599500 + }, + { + "epoch": 2.9842997853480204, + "grad_norm": 10.422523498535156, + "learning_rate": 2.6167024419965585e-07, + "loss": 2.537, + "step": 9600000 + }, + { + "epoch": 2.9844552176285077, + "grad_norm": 12.930542945861816, + "learning_rate": 2.5907970619154126e-07, + "loss": 2.5491, + "step": 9600500 + }, + { + "epoch": 2.984610649908994, + "grad_norm": 9.980289459228516, + "learning_rate": 2.5648916818342667e-07, + "loss": 2.5645, + "step": 9601000 + }, + { + "epoch": 2.9847660821894815, + "grad_norm": 11.189658164978027, + "learning_rate": 2.538986301753121e-07, + "loss": 2.5788, + "step": 9601500 + }, + { + "epoch": 2.984921514469968, + "grad_norm": 21.833066940307617, + "learning_rate": 2.513080921671975e-07, + "loss": 2.5941, + "step": 9602000 + }, + { + "epoch": 2.985076946750455, + "grad_norm": 8.606268882751465, + "learning_rate": 2.487175541590829e-07, + "loss": 2.5854, + "step": 9602500 + }, + { + "epoch": 2.9852323790309416, + "grad_norm": 11.330849647521973, + "learning_rate": 2.461270161509683e-07, + "loss": 2.5619, + "step": 9603000 + }, + { + "epoch": 2.985387811311429, + "grad_norm": 17.259788513183594, + "learning_rate": 2.435364781428537e-07, + "loss": 2.5871, + "step": 9603500 + }, + { + "epoch": 2.985543243591916, + "grad_norm": 10.144144058227539, + "learning_rate": 2.409459401347391e-07, + "loss": 2.5583, + "step": 9604000 + }, + { + "epoch": 2.9856986758724027, + "grad_norm": 51.96742248535156, + "learning_rate": 2.3835540212662449e-07, + "loss": 2.5375, + "step": 9604500 + }, + { + "epoch": 2.9858541081528895, + "grad_norm": 10.412690162658691, + "learning_rate": 2.3576486411850987e-07, + "loss": 2.5942, + "step": 9605000 + }, + { + "epoch": 2.9860095404333764, + "grad_norm": 11.194098472595215, + "learning_rate": 2.3317432611039527e-07, + "loss": 2.5406, + "step": 9605500 + }, + { + "epoch": 2.9861649727138633, + "grad_norm": 10.052042007446289, + "learning_rate": 2.3058378810228068e-07, + "loss": 2.4943, + "step": 9606000 + }, + { + "epoch": 2.98632040499435, + "grad_norm": 8.878747940063477, + "learning_rate": 2.2799325009416608e-07, + "loss": 2.5529, + "step": 9606500 + }, + { + "epoch": 2.986475837274837, + "grad_norm": 6.35205602645874, + "learning_rate": 2.254027120860515e-07, + "loss": 2.5785, + "step": 9607000 + }, + { + "epoch": 2.986631269555324, + "grad_norm": 26.237688064575195, + "learning_rate": 2.2281217407793684e-07, + "loss": 2.551, + "step": 9607500 + }, + { + "epoch": 2.9867867018358107, + "grad_norm": 12.108182907104492, + "learning_rate": 2.2022163606982225e-07, + "loss": 2.5414, + "step": 9608000 + }, + { + "epoch": 2.9869421341162976, + "grad_norm": 11.157447814941406, + "learning_rate": 2.1763109806170766e-07, + "loss": 2.5409, + "step": 9608500 + }, + { + "epoch": 2.9870975663967845, + "grad_norm": 11.366740226745605, + "learning_rate": 2.1504056005359306e-07, + "loss": 2.5657, + "step": 9609000 + }, + { + "epoch": 2.9872529986772713, + "grad_norm": 10.566927909851074, + "learning_rate": 2.1245002204547844e-07, + "loss": 2.5569, + "step": 9609500 + }, + { + "epoch": 2.987408430957758, + "grad_norm": 11.025724411010742, + "learning_rate": 2.0985948403736385e-07, + "loss": 2.5591, + "step": 9610000 + }, + { + "epoch": 2.987563863238245, + "grad_norm": 9.862140655517578, + "learning_rate": 2.0726894602924926e-07, + "loss": 2.5178, + "step": 9610500 + }, + { + "epoch": 2.987719295518732, + "grad_norm": 12.560545921325684, + "learning_rate": 2.0467840802113466e-07, + "loss": 2.5427, + "step": 9611000 + }, + { + "epoch": 2.987874727799219, + "grad_norm": 9.686115264892578, + "learning_rate": 2.0208787001302007e-07, + "loss": 2.5233, + "step": 9611500 + }, + { + "epoch": 2.9880301600797057, + "grad_norm": 12.178911209106445, + "learning_rate": 1.9949733200490542e-07, + "loss": 2.5428, + "step": 9612000 + }, + { + "epoch": 2.9881855923601925, + "grad_norm": 55.65264892578125, + "learning_rate": 1.9690679399679083e-07, + "loss": 2.5159, + "step": 9612500 + }, + { + "epoch": 2.9883410246406794, + "grad_norm": 13.531757354736328, + "learning_rate": 1.9431625598867626e-07, + "loss": 2.5272, + "step": 9613000 + }, + { + "epoch": 2.9884964569211663, + "grad_norm": 9.197335243225098, + "learning_rate": 1.9172571798056167e-07, + "loss": 2.5883, + "step": 9613500 + }, + { + "epoch": 2.988651889201653, + "grad_norm": 11.819212913513184, + "learning_rate": 1.8913517997244705e-07, + "loss": 2.5356, + "step": 9614000 + }, + { + "epoch": 2.98880732148214, + "grad_norm": 26.645862579345703, + "learning_rate": 1.8654464196433245e-07, + "loss": 2.5472, + "step": 9614500 + }, + { + "epoch": 2.988962753762627, + "grad_norm": 9.074607849121094, + "learning_rate": 1.8395410395621783e-07, + "loss": 2.5019, + "step": 9615000 + }, + { + "epoch": 2.9891181860431137, + "grad_norm": 10.392667770385742, + "learning_rate": 1.8136356594810324e-07, + "loss": 2.5333, + "step": 9615500 + }, + { + "epoch": 2.9892736183236006, + "grad_norm": 10.325907707214355, + "learning_rate": 1.7877302793998865e-07, + "loss": 2.5261, + "step": 9616000 + }, + { + "epoch": 2.9894290506040875, + "grad_norm": 11.34564208984375, + "learning_rate": 1.7618248993187403e-07, + "loss": 2.5479, + "step": 9616500 + }, + { + "epoch": 2.9895844828845743, + "grad_norm": 8.89979362487793, + "learning_rate": 1.7359195192375943e-07, + "loss": 2.5852, + "step": 9617000 + }, + { + "epoch": 2.989739915165061, + "grad_norm": 8.992892265319824, + "learning_rate": 1.7100141391564484e-07, + "loss": 2.546, + "step": 9617500 + }, + { + "epoch": 2.989895347445548, + "grad_norm": 10.992084503173828, + "learning_rate": 1.6841087590753025e-07, + "loss": 2.5248, + "step": 9618000 + }, + { + "epoch": 2.990050779726035, + "grad_norm": 12.0274658203125, + "learning_rate": 1.6582033789941563e-07, + "loss": 2.5816, + "step": 9618500 + }, + { + "epoch": 2.990206212006522, + "grad_norm": 10.461660385131836, + "learning_rate": 1.6322979989130103e-07, + "loss": 2.5828, + "step": 9619000 + }, + { + "epoch": 2.9903616442870087, + "grad_norm": 9.81654167175293, + "learning_rate": 1.6063926188318644e-07, + "loss": 2.5544, + "step": 9619500 + }, + { + "epoch": 2.9905170765674955, + "grad_norm": 11.132402420043945, + "learning_rate": 1.5804872387507182e-07, + "loss": 2.5702, + "step": 9620000 + }, + { + "epoch": 2.9906725088479824, + "grad_norm": 10.19917106628418, + "learning_rate": 1.5545818586695722e-07, + "loss": 2.5402, + "step": 9620500 + }, + { + "epoch": 2.9908279411284693, + "grad_norm": 10.556532859802246, + "learning_rate": 1.5286764785884263e-07, + "loss": 2.504, + "step": 9621000 + }, + { + "epoch": 2.990983373408956, + "grad_norm": 9.97053337097168, + "learning_rate": 1.5027710985072804e-07, + "loss": 2.528, + "step": 9621500 + }, + { + "epoch": 2.9911388056894435, + "grad_norm": 10.854775428771973, + "learning_rate": 1.4768657184261342e-07, + "loss": 2.5101, + "step": 9622000 + }, + { + "epoch": 2.99129423796993, + "grad_norm": 13.410733222961426, + "learning_rate": 1.4509603383449882e-07, + "loss": 2.5339, + "step": 9622500 + }, + { + "epoch": 2.991449670250417, + "grad_norm": 10.090474128723145, + "learning_rate": 1.4250549582638423e-07, + "loss": 2.5292, + "step": 9623000 + }, + { + "epoch": 2.9916051025309036, + "grad_norm": 12.232257843017578, + "learning_rate": 1.399149578182696e-07, + "loss": 2.5598, + "step": 9623500 + }, + { + "epoch": 2.991760534811391, + "grad_norm": 10.715484619140625, + "learning_rate": 1.3732441981015502e-07, + "loss": 2.4963, + "step": 9624000 + }, + { + "epoch": 2.9919159670918773, + "grad_norm": 39.777896881103516, + "learning_rate": 1.347338818020404e-07, + "loss": 2.5194, + "step": 9624500 + }, + { + "epoch": 2.9920713993723647, + "grad_norm": 10.542218208312988, + "learning_rate": 1.3214334379392583e-07, + "loss": 2.5159, + "step": 9625000 + }, + { + "epoch": 2.992226831652851, + "grad_norm": 8.898301124572754, + "learning_rate": 1.2955280578581124e-07, + "loss": 2.5613, + "step": 9625500 + }, + { + "epoch": 2.9923822639333384, + "grad_norm": 12.443818092346191, + "learning_rate": 1.2696226777769661e-07, + "loss": 2.5308, + "step": 9626000 + }, + { + "epoch": 2.992537696213825, + "grad_norm": 9.92770004272461, + "learning_rate": 1.2437172976958202e-07, + "loss": 2.5233, + "step": 9626500 + }, + { + "epoch": 2.992693128494312, + "grad_norm": 11.819747924804688, + "learning_rate": 1.217811917614674e-07, + "loss": 2.5617, + "step": 9627000 + }, + { + "epoch": 2.992848560774799, + "grad_norm": 10.386639595031738, + "learning_rate": 1.1919065375335281e-07, + "loss": 2.5071, + "step": 9627500 + }, + { + "epoch": 2.993003993055286, + "grad_norm": 11.884387969970703, + "learning_rate": 1.166001157452382e-07, + "loss": 2.5822, + "step": 9628000 + }, + { + "epoch": 2.9931594253357727, + "grad_norm": 9.681358337402344, + "learning_rate": 1.1400957773712361e-07, + "loss": 2.5223, + "step": 9628500 + }, + { + "epoch": 2.9933148576162596, + "grad_norm": 10.392304420471191, + "learning_rate": 1.1141903972900901e-07, + "loss": 2.5182, + "step": 9629000 + }, + { + "epoch": 2.9934702898967465, + "grad_norm": 13.719117164611816, + "learning_rate": 1.0882850172089439e-07, + "loss": 2.5162, + "step": 9629500 + }, + { + "epoch": 2.9936257221772333, + "grad_norm": 10.676910400390625, + "learning_rate": 1.0623796371277981e-07, + "loss": 2.6007, + "step": 9630000 + }, + { + "epoch": 2.99378115445772, + "grad_norm": 10.641459465026855, + "learning_rate": 1.0364742570466519e-07, + "loss": 2.5607, + "step": 9630500 + }, + { + "epoch": 2.993936586738207, + "grad_norm": 10.007463455200195, + "learning_rate": 1.010568876965506e-07, + "loss": 2.5597, + "step": 9631000 + }, + { + "epoch": 2.994092019018694, + "grad_norm": 9.638599395751953, + "learning_rate": 9.8466349688436e-08, + "loss": 2.4992, + "step": 9631500 + }, + { + "epoch": 2.994247451299181, + "grad_norm": 10.100459098815918, + "learning_rate": 9.58758116803214e-08, + "loss": 2.5436, + "step": 9632000 + }, + { + "epoch": 2.9944028835796677, + "grad_norm": 19.149076461791992, + "learning_rate": 9.328527367220679e-08, + "loss": 2.5312, + "step": 9632500 + }, + { + "epoch": 2.9945583158601545, + "grad_norm": 10.84813404083252, + "learning_rate": 9.06947356640922e-08, + "loss": 2.5068, + "step": 9633000 + }, + { + "epoch": 2.9947137481406414, + "grad_norm": 10.606138229370117, + "learning_rate": 8.810419765597759e-08, + "loss": 2.5284, + "step": 9633500 + }, + { + "epoch": 2.9948691804211283, + "grad_norm": 10.711310386657715, + "learning_rate": 8.5513659647863e-08, + "loss": 2.5278, + "step": 9634000 + }, + { + "epoch": 2.995024612701615, + "grad_norm": 10.198830604553223, + "learning_rate": 8.292312163974839e-08, + "loss": 2.5332, + "step": 9634500 + }, + { + "epoch": 2.995180044982102, + "grad_norm": 17.587024688720703, + "learning_rate": 8.033258363163378e-08, + "loss": 2.5383, + "step": 9635000 + }, + { + "epoch": 2.995335477262589, + "grad_norm": 8.508935928344727, + "learning_rate": 7.774204562351918e-08, + "loss": 2.5757, + "step": 9635500 + }, + { + "epoch": 2.9954909095430757, + "grad_norm": 10.019346237182617, + "learning_rate": 7.51515076154046e-08, + "loss": 2.5278, + "step": 9636000 + }, + { + "epoch": 2.9956463418235626, + "grad_norm": 10.553516387939453, + "learning_rate": 7.256096960728999e-08, + "loss": 2.5605, + "step": 9636500 + }, + { + "epoch": 2.9958017741040495, + "grad_norm": 9.779068946838379, + "learning_rate": 6.997043159917538e-08, + "loss": 2.5705, + "step": 9637000 + }, + { + "epoch": 2.9959572063845363, + "grad_norm": 10.442828178405762, + "learning_rate": 6.737989359106078e-08, + "loss": 2.5451, + "step": 9637500 + }, + { + "epoch": 2.996112638665023, + "grad_norm": 8.23055362701416, + "learning_rate": 6.478935558294618e-08, + "loss": 2.5102, + "step": 9638000 + }, + { + "epoch": 2.99626807094551, + "grad_norm": 10.194281578063965, + "learning_rate": 6.219881757483158e-08, + "loss": 2.527, + "step": 9638500 + }, + { + "epoch": 2.996423503225997, + "grad_norm": 11.066518783569336, + "learning_rate": 5.960827956671698e-08, + "loss": 2.5325, + "step": 9639000 + }, + { + "epoch": 2.996578935506484, + "grad_norm": 13.751860618591309, + "learning_rate": 5.701774155860238e-08, + "loss": 2.5867, + "step": 9639500 + }, + { + "epoch": 2.9967343677869707, + "grad_norm": 11.067245483398438, + "learning_rate": 5.4427203550487775e-08, + "loss": 2.5797, + "step": 9640000 + }, + { + "epoch": 2.9968898000674575, + "grad_norm": 9.238104820251465, + "learning_rate": 5.1836665542373174e-08, + "loss": 2.5305, + "step": 9640500 + }, + { + "epoch": 2.9970452323479444, + "grad_norm": 9.684460639953613, + "learning_rate": 4.924612753425857e-08, + "loss": 2.4887, + "step": 9641000 + }, + { + "epoch": 2.9972006646284313, + "grad_norm": 9.892979621887207, + "learning_rate": 4.6655589526143974e-08, + "loss": 2.5349, + "step": 9641500 + }, + { + "epoch": 2.997356096908918, + "grad_norm": 16.026405334472656, + "learning_rate": 4.406505151802937e-08, + "loss": 2.5294, + "step": 9642000 + }, + { + "epoch": 2.997511529189405, + "grad_norm": 9.597797393798828, + "learning_rate": 4.147451350991477e-08, + "loss": 2.5035, + "step": 9642500 + }, + { + "epoch": 2.997666961469892, + "grad_norm": 9.39564323425293, + "learning_rate": 3.8883975501800166e-08, + "loss": 2.5646, + "step": 9643000 + }, + { + "epoch": 2.9978223937503787, + "grad_norm": 10.482442855834961, + "learning_rate": 3.6293437493685566e-08, + "loss": 2.5219, + "step": 9643500 + }, + { + "epoch": 2.9979778260308656, + "grad_norm": 10.84255313873291, + "learning_rate": 3.370289948557096e-08, + "loss": 2.5709, + "step": 9644000 + }, + { + "epoch": 2.9981332583113525, + "grad_norm": 12.781850814819336, + "learning_rate": 3.1112361477456366e-08, + "loss": 2.5583, + "step": 9644500 + }, + { + "epoch": 2.9982886905918393, + "grad_norm": 16.42194175720215, + "learning_rate": 2.8521823469341762e-08, + "loss": 2.5218, + "step": 9645000 + }, + { + "epoch": 2.9984441228723266, + "grad_norm": 8.03348445892334, + "learning_rate": 2.593128546122716e-08, + "loss": 2.5243, + "step": 9645500 + }, + { + "epoch": 2.998599555152813, + "grad_norm": 10.925944328308105, + "learning_rate": 2.3340747453112558e-08, + "loss": 2.5787, + "step": 9646000 + }, + { + "epoch": 2.9987549874333004, + "grad_norm": 16.753633499145508, + "learning_rate": 2.0750209444997958e-08, + "loss": 2.5168, + "step": 9646500 + }, + { + "epoch": 2.998910419713787, + "grad_norm": 11.790155410766602, + "learning_rate": 1.8159671436883354e-08, + "loss": 2.5359, + "step": 9647000 + }, + { + "epoch": 2.999065851994274, + "grad_norm": 9.199599266052246, + "learning_rate": 1.5569133428768754e-08, + "loss": 2.5865, + "step": 9647500 + }, + { + "epoch": 2.9992212842747605, + "grad_norm": 15.874154090881348, + "learning_rate": 1.2978595420654152e-08, + "loss": 2.5687, + "step": 9648000 + }, + { + "epoch": 2.999376716555248, + "grad_norm": 9.384015083312988, + "learning_rate": 1.0388057412539552e-08, + "loss": 2.5662, + "step": 9648500 + }, + { + "epoch": 2.9995321488357343, + "grad_norm": 10.502700805664062, + "learning_rate": 7.79751940442495e-09, + "loss": 2.5503, + "step": 9649000 + }, + { + "epoch": 2.9996875811162216, + "grad_norm": 11.928768157958984, + "learning_rate": 5.206981396310349e-09, + "loss": 2.6018, + "step": 9649500 + }, + { + "epoch": 2.999843013396708, + "grad_norm": 10.027670860290527, + "learning_rate": 2.6164433881957476e-09, + "loss": 2.5846, + "step": 9650000 + }, + { + "epoch": 2.9999984456771953, + "grad_norm": 14.092852592468262, + "learning_rate": 2.590538008114601e-11, + "loss": 2.5132, + "step": 9650500 + } + ], + "logging_steps": 500, + "max_steps": 9650505, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.0763853642848662e+19, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +}