{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 3033, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009891196834817012, "grad_norm": 47.47864436307981, "learning_rate": 1.6447368421052632e-07, "loss": 11.4052, "step": 1 }, { "epoch": 0.0019782393669634025, "grad_norm": 45.91292076188422, "learning_rate": 3.2894736842105264e-07, "loss": 11.2815, "step": 2 }, { "epoch": 0.002967359050445104, "grad_norm": 47.726852516408236, "learning_rate": 4.934210526315789e-07, "loss": 11.3935, "step": 3 }, { "epoch": 0.003956478733926805, "grad_norm": 45.10574020271687, "learning_rate": 6.578947368421053e-07, "loss": 11.5897, "step": 4 }, { "epoch": 0.004945598417408506, "grad_norm": 50.8689323614659, "learning_rate": 8.223684210526316e-07, "loss": 11.2181, "step": 5 }, { "epoch": 0.005934718100890208, "grad_norm": 44.83636067584137, "learning_rate": 9.868421052631579e-07, "loss": 11.543, "step": 6 }, { "epoch": 0.006923837784371909, "grad_norm": 44.619692790934934, "learning_rate": 1.1513157894736842e-06, "loss": 11.364, "step": 7 }, { "epoch": 0.00791295746785361, "grad_norm": 46.17981710645455, "learning_rate": 1.3157894736842106e-06, "loss": 11.372, "step": 8 }, { "epoch": 0.008902077151335312, "grad_norm": 49.73917366470631, "learning_rate": 1.480263157894737e-06, "loss": 11.0869, "step": 9 }, { "epoch": 0.009891196834817012, "grad_norm": 52.40342778684296, "learning_rate": 1.6447368421052632e-06, "loss": 11.054, "step": 10 }, { "epoch": 0.010880316518298714, "grad_norm": 48.69950321668828, "learning_rate": 1.8092105263157896e-06, "loss": 11.0688, "step": 11 }, { "epoch": 0.011869436201780416, "grad_norm": 63.65227677037272, "learning_rate": 1.9736842105263157e-06, "loss": 10.3072, "step": 12 }, { "epoch": 0.012858555885262116, "grad_norm": 79.55368321861843, "learning_rate": 2.138157894736842e-06, "loss": 9.7245, "step": 13 }, { "epoch": 0.013847675568743818, "grad_norm": 82.8103382216083, "learning_rate": 2.3026315789473684e-06, "loss": 9.4617, "step": 14 }, { "epoch": 0.01483679525222552, "grad_norm": 84.84620253448091, "learning_rate": 2.4671052631578948e-06, "loss": 9.3283, "step": 15 }, { "epoch": 0.01582591493570722, "grad_norm": 72.7809868244949, "learning_rate": 2.631578947368421e-06, "loss": 4.3421, "step": 16 }, { "epoch": 0.016815034619188922, "grad_norm": 68.45135162785618, "learning_rate": 2.7960526315789475e-06, "loss": 4.768, "step": 17 }, { "epoch": 0.017804154302670624, "grad_norm": 65.03039569474328, "learning_rate": 2.960526315789474e-06, "loss": 4.418, "step": 18 }, { "epoch": 0.018793273986152326, "grad_norm": 45.947045347522504, "learning_rate": 3.125e-06, "loss": 3.3595, "step": 19 }, { "epoch": 0.019782393669634024, "grad_norm": 43.95974652547691, "learning_rate": 3.2894736842105265e-06, "loss": 3.1867, "step": 20 }, { "epoch": 0.020771513353115726, "grad_norm": 9.308745952351916, "learning_rate": 3.4539473684210533e-06, "loss": 1.9175, "step": 21 }, { "epoch": 0.021760633036597428, "grad_norm": 5.755918880001644, "learning_rate": 3.618421052631579e-06, "loss": 1.536, "step": 22 }, { "epoch": 0.02274975272007913, "grad_norm": 4.930173474585826, "learning_rate": 3.7828947368421055e-06, "loss": 1.4667, "step": 23 }, { "epoch": 0.02373887240356083, "grad_norm": 4.28733293354817, "learning_rate": 3.9473684210526315e-06, "loss": 1.4006, "step": 24 }, { "epoch": 0.024727992087042534, "grad_norm": 3.829716127893356, "learning_rate": 4.111842105263159e-06, "loss": 1.532, "step": 25 }, { "epoch": 0.025717111770524232, "grad_norm": 3.0324967863470813, "learning_rate": 4.276315789473684e-06, "loss": 1.4976, "step": 26 }, { "epoch": 0.026706231454005934, "grad_norm": 2.5586232440965344, "learning_rate": 4.4407894736842105e-06, "loss": 1.3703, "step": 27 }, { "epoch": 0.027695351137487636, "grad_norm": 1.9156066610224773, "learning_rate": 4.605263157894737e-06, "loss": 1.1345, "step": 28 }, { "epoch": 0.028684470820969338, "grad_norm": 1.74939443115538, "learning_rate": 4.769736842105264e-06, "loss": 1.2478, "step": 29 }, { "epoch": 0.02967359050445104, "grad_norm": 1.5831108352532686, "learning_rate": 4.9342105263157895e-06, "loss": 1.1836, "step": 30 }, { "epoch": 0.03066271018793274, "grad_norm": 2.2356207014234997, "learning_rate": 5.098684210526316e-06, "loss": 1.3763, "step": 31 }, { "epoch": 0.03165182987141444, "grad_norm": 1.1026657175125092, "learning_rate": 5.263157894736842e-06, "loss": 1.0571, "step": 32 }, { "epoch": 0.032640949554896145, "grad_norm": 1.0672135675760805, "learning_rate": 5.4276315789473686e-06, "loss": 1.1693, "step": 33 }, { "epoch": 0.033630069238377844, "grad_norm": 0.9532916815841181, "learning_rate": 5.592105263157895e-06, "loss": 0.9262, "step": 34 }, { "epoch": 0.03461918892185954, "grad_norm": 0.8351774057987832, "learning_rate": 5.756578947368421e-06, "loss": 0.9843, "step": 35 }, { "epoch": 0.03560830860534125, "grad_norm": 0.7152560683189684, "learning_rate": 5.921052631578948e-06, "loss": 0.8934, "step": 36 }, { "epoch": 0.036597428288822946, "grad_norm": 0.8605762164117818, "learning_rate": 6.085526315789474e-06, "loss": 1.0897, "step": 37 }, { "epoch": 0.03758654797230465, "grad_norm": 0.8760202419017974, "learning_rate": 6.25e-06, "loss": 1.1443, "step": 38 }, { "epoch": 0.03857566765578635, "grad_norm": 0.6624437855685319, "learning_rate": 6.4144736842105275e-06, "loss": 0.8699, "step": 39 }, { "epoch": 0.03956478733926805, "grad_norm": 0.7049615396906419, "learning_rate": 6.578947368421053e-06, "loss": 0.9151, "step": 40 }, { "epoch": 0.040553907022749754, "grad_norm": 0.7854506183295861, "learning_rate": 6.743421052631579e-06, "loss": 0.9542, "step": 41 }, { "epoch": 0.04154302670623145, "grad_norm": 0.6223807683273548, "learning_rate": 6.9078947368421065e-06, "loss": 0.9371, "step": 42 }, { "epoch": 0.04253214638971316, "grad_norm": 0.6248927306368205, "learning_rate": 7.072368421052632e-06, "loss": 0.9072, "step": 43 }, { "epoch": 0.043521266073194856, "grad_norm": 0.647479183993099, "learning_rate": 7.236842105263158e-06, "loss": 0.9054, "step": 44 }, { "epoch": 0.04451038575667656, "grad_norm": 0.5889797054704718, "learning_rate": 7.401315789473684e-06, "loss": 1.0402, "step": 45 }, { "epoch": 0.04549950544015826, "grad_norm": 0.4967497474491158, "learning_rate": 7.565789473684211e-06, "loss": 0.9069, "step": 46 }, { "epoch": 0.04648862512363996, "grad_norm": 0.5661932222672871, "learning_rate": 7.730263157894737e-06, "loss": 0.9583, "step": 47 }, { "epoch": 0.04747774480712166, "grad_norm": 6.156382243862993, "learning_rate": 7.894736842105263e-06, "loss": 0.977, "step": 48 }, { "epoch": 0.04846686449060336, "grad_norm": 0.6197962682604914, "learning_rate": 8.05921052631579e-06, "loss": 0.7968, "step": 49 }, { "epoch": 0.04945598417408507, "grad_norm": 0.5935105377105369, "learning_rate": 8.223684210526317e-06, "loss": 0.8727, "step": 50 }, { "epoch": 0.050445103857566766, "grad_norm": 0.5139477978028617, "learning_rate": 8.388157894736843e-06, "loss": 0.9035, "step": 51 }, { "epoch": 0.051434223541048464, "grad_norm": 0.48831574166447717, "learning_rate": 8.552631578947368e-06, "loss": 0.7875, "step": 52 }, { "epoch": 0.05242334322453017, "grad_norm": 0.5106264570172955, "learning_rate": 8.717105263157894e-06, "loss": 0.9089, "step": 53 }, { "epoch": 0.05341246290801187, "grad_norm": 0.48126592930194784, "learning_rate": 8.881578947368421e-06, "loss": 0.7222, "step": 54 }, { "epoch": 0.05440158259149357, "grad_norm": 0.3816418172871715, "learning_rate": 9.046052631578948e-06, "loss": 0.7726, "step": 55 }, { "epoch": 0.05539070227497527, "grad_norm": 0.3805932483187457, "learning_rate": 9.210526315789474e-06, "loss": 0.7265, "step": 56 }, { "epoch": 0.05637982195845697, "grad_norm": 0.35478918543818216, "learning_rate": 9.375000000000001e-06, "loss": 0.7009, "step": 57 }, { "epoch": 0.057368941641938676, "grad_norm": 0.34092363071107723, "learning_rate": 9.539473684210528e-06, "loss": 0.7749, "step": 58 }, { "epoch": 0.058358061325420374, "grad_norm": 0.36162518512491465, "learning_rate": 9.703947368421054e-06, "loss": 0.8226, "step": 59 }, { "epoch": 0.05934718100890208, "grad_norm": 0.40209857091787987, "learning_rate": 9.868421052631579e-06, "loss": 0.836, "step": 60 }, { "epoch": 0.06033630069238378, "grad_norm": 0.3549385133528555, "learning_rate": 1.0032894736842106e-05, "loss": 0.7356, "step": 61 }, { "epoch": 0.06132542037586548, "grad_norm": 0.3578780388034424, "learning_rate": 1.0197368421052632e-05, "loss": 0.8599, "step": 62 }, { "epoch": 0.06231454005934718, "grad_norm": 0.3669356895971538, "learning_rate": 1.0361842105263159e-05, "loss": 0.8198, "step": 63 }, { "epoch": 0.06330365974282888, "grad_norm": 0.3543644351832402, "learning_rate": 1.0526315789473684e-05, "loss": 0.8604, "step": 64 }, { "epoch": 0.06429277942631058, "grad_norm": 0.3713938433998872, "learning_rate": 1.0690789473684212e-05, "loss": 0.8382, "step": 65 }, { "epoch": 0.06528189910979229, "grad_norm": 0.3340439100059536, "learning_rate": 1.0855263157894737e-05, "loss": 0.7577, "step": 66 }, { "epoch": 0.06627101879327399, "grad_norm": 0.31244239116416433, "learning_rate": 1.1019736842105263e-05, "loss": 0.714, "step": 67 }, { "epoch": 0.06726013847675569, "grad_norm": 0.36393336511648067, "learning_rate": 1.118421052631579e-05, "loss": 0.8557, "step": 68 }, { "epoch": 0.06824925816023739, "grad_norm": 0.39874922213238695, "learning_rate": 1.1348684210526317e-05, "loss": 0.7143, "step": 69 }, { "epoch": 0.06923837784371908, "grad_norm": 0.3354909696602849, "learning_rate": 1.1513157894736843e-05, "loss": 0.8087, "step": 70 }, { "epoch": 0.0702274975272008, "grad_norm": 0.3195954665583774, "learning_rate": 1.167763157894737e-05, "loss": 0.7187, "step": 71 }, { "epoch": 0.0712166172106825, "grad_norm": 0.3134655600001495, "learning_rate": 1.1842105263157895e-05, "loss": 0.8406, "step": 72 }, { "epoch": 0.0722057368941642, "grad_norm": 0.3026614417415018, "learning_rate": 1.200657894736842e-05, "loss": 0.8183, "step": 73 }, { "epoch": 0.07319485657764589, "grad_norm": 0.3216818743924892, "learning_rate": 1.2171052631578948e-05, "loss": 0.849, "step": 74 }, { "epoch": 0.07418397626112759, "grad_norm": 0.3667927953864343, "learning_rate": 1.2335526315789473e-05, "loss": 0.8876, "step": 75 }, { "epoch": 0.0751730959446093, "grad_norm": 0.31846281371038865, "learning_rate": 1.25e-05, "loss": 0.7137, "step": 76 }, { "epoch": 0.076162215628091, "grad_norm": 0.3649775475138143, "learning_rate": 1.2664473684210526e-05, "loss": 0.9686, "step": 77 }, { "epoch": 0.0771513353115727, "grad_norm": 0.3552395073907018, "learning_rate": 1.2828947368421055e-05, "loss": 0.9255, "step": 78 }, { "epoch": 0.0781404549950544, "grad_norm": 0.3362980375908929, "learning_rate": 1.299342105263158e-05, "loss": 0.7663, "step": 79 }, { "epoch": 0.0791295746785361, "grad_norm": 0.3332486864873228, "learning_rate": 1.3157894736842106e-05, "loss": 0.8464, "step": 80 }, { "epoch": 0.08011869436201781, "grad_norm": 0.3563264366987691, "learning_rate": 1.3322368421052633e-05, "loss": 0.7251, "step": 81 }, { "epoch": 0.08110781404549951, "grad_norm": 0.32718462995919856, "learning_rate": 1.3486842105263159e-05, "loss": 0.8337, "step": 82 }, { "epoch": 0.0820969337289812, "grad_norm": 0.3098690267315848, "learning_rate": 1.3651315789473684e-05, "loss": 0.7089, "step": 83 }, { "epoch": 0.0830860534124629, "grad_norm": 0.33231060025498604, "learning_rate": 1.3815789473684213e-05, "loss": 0.6336, "step": 84 }, { "epoch": 0.0840751730959446, "grad_norm": 0.28062844248423013, "learning_rate": 1.3980263157894739e-05, "loss": 0.6994, "step": 85 }, { "epoch": 0.08506429277942631, "grad_norm": 0.29411686322877856, "learning_rate": 1.4144736842105264e-05, "loss": 0.6726, "step": 86 }, { "epoch": 0.08605341246290801, "grad_norm": 0.34261251457783337, "learning_rate": 1.430921052631579e-05, "loss": 0.8291, "step": 87 }, { "epoch": 0.08704253214638971, "grad_norm": 0.36178717911541053, "learning_rate": 1.4473684210526317e-05, "loss": 0.8385, "step": 88 }, { "epoch": 0.08803165182987141, "grad_norm": 0.3461353549466678, "learning_rate": 1.4638157894736842e-05, "loss": 0.7122, "step": 89 }, { "epoch": 0.08902077151335312, "grad_norm": 0.35500836343925757, "learning_rate": 1.4802631578947368e-05, "loss": 0.7191, "step": 90 }, { "epoch": 0.09000989119683482, "grad_norm": 0.3361253921537601, "learning_rate": 1.4967105263157897e-05, "loss": 0.8086, "step": 91 }, { "epoch": 0.09099901088031652, "grad_norm": 0.33504758656742495, "learning_rate": 1.5131578947368422e-05, "loss": 0.7375, "step": 92 }, { "epoch": 0.09198813056379822, "grad_norm": 0.3334568733991201, "learning_rate": 1.5296052631578946e-05, "loss": 0.7225, "step": 93 }, { "epoch": 0.09297725024727992, "grad_norm": 0.3350306861891048, "learning_rate": 1.5460526315789475e-05, "loss": 0.7998, "step": 94 }, { "epoch": 0.09396636993076163, "grad_norm": 0.38462200040273103, "learning_rate": 1.5625e-05, "loss": 0.8569, "step": 95 }, { "epoch": 0.09495548961424333, "grad_norm": 0.34279551157272875, "learning_rate": 1.5789473684210526e-05, "loss": 0.6453, "step": 96 }, { "epoch": 0.09594460929772503, "grad_norm": 0.33073887008527225, "learning_rate": 1.5953947368421055e-05, "loss": 0.7839, "step": 97 }, { "epoch": 0.09693372898120672, "grad_norm": 0.2931675479755555, "learning_rate": 1.611842105263158e-05, "loss": 0.7174, "step": 98 }, { "epoch": 0.09792284866468842, "grad_norm": 0.3169582507658582, "learning_rate": 1.6282894736842106e-05, "loss": 0.7605, "step": 99 }, { "epoch": 0.09891196834817013, "grad_norm": 0.42763487156637653, "learning_rate": 1.6447368421052635e-05, "loss": 0.7602, "step": 100 }, { "epoch": 0.09990108803165183, "grad_norm": 0.29074916324227895, "learning_rate": 1.661184210526316e-05, "loss": 0.6866, "step": 101 }, { "epoch": 0.10089020771513353, "grad_norm": 0.352348279384898, "learning_rate": 1.6776315789473686e-05, "loss": 0.8076, "step": 102 }, { "epoch": 0.10187932739861523, "grad_norm": 0.3273893679465929, "learning_rate": 1.694078947368421e-05, "loss": 0.7077, "step": 103 }, { "epoch": 0.10286844708209693, "grad_norm": 0.3571307875845898, "learning_rate": 1.7105263157894737e-05, "loss": 0.7014, "step": 104 }, { "epoch": 0.10385756676557864, "grad_norm": 0.33000827078733097, "learning_rate": 1.7269736842105262e-05, "loss": 0.6654, "step": 105 }, { "epoch": 0.10484668644906034, "grad_norm": 0.4595448328039601, "learning_rate": 1.7434210526315788e-05, "loss": 0.7122, "step": 106 }, { "epoch": 0.10583580613254204, "grad_norm": 0.35389814780232176, "learning_rate": 1.7598684210526316e-05, "loss": 0.6255, "step": 107 }, { "epoch": 0.10682492581602374, "grad_norm": 0.35214588487585297, "learning_rate": 1.7763157894736842e-05, "loss": 0.6463, "step": 108 }, { "epoch": 0.10781404549950543, "grad_norm": 0.37496257339354294, "learning_rate": 1.7927631578947367e-05, "loss": 0.6374, "step": 109 }, { "epoch": 0.10880316518298715, "grad_norm": 0.3901229505469495, "learning_rate": 1.8092105263157896e-05, "loss": 0.8409, "step": 110 }, { "epoch": 0.10979228486646884, "grad_norm": 0.36503555388396497, "learning_rate": 1.8256578947368422e-05, "loss": 0.7027, "step": 111 }, { "epoch": 0.11078140454995054, "grad_norm": 0.39058659212806174, "learning_rate": 1.8421052631578947e-05, "loss": 0.666, "step": 112 }, { "epoch": 0.11177052423343224, "grad_norm": 0.3393334155039262, "learning_rate": 1.8585526315789476e-05, "loss": 0.6489, "step": 113 }, { "epoch": 0.11275964391691394, "grad_norm": 0.3641878610374112, "learning_rate": 1.8750000000000002e-05, "loss": 0.5997, "step": 114 }, { "epoch": 0.11374876360039565, "grad_norm": 0.35954765537088884, "learning_rate": 1.8914473684210527e-05, "loss": 0.6643, "step": 115 }, { "epoch": 0.11473788328387735, "grad_norm": 0.332353400970844, "learning_rate": 1.9078947368421056e-05, "loss": 0.6558, "step": 116 }, { "epoch": 0.11572700296735905, "grad_norm": 0.3658649526637821, "learning_rate": 1.924342105263158e-05, "loss": 0.748, "step": 117 }, { "epoch": 0.11671612265084075, "grad_norm": 0.4110068913472326, "learning_rate": 1.9407894736842107e-05, "loss": 0.7679, "step": 118 }, { "epoch": 0.11770524233432245, "grad_norm": 0.312188988598259, "learning_rate": 1.9572368421052633e-05, "loss": 0.7806, "step": 119 }, { "epoch": 0.11869436201780416, "grad_norm": 0.39415290875961995, "learning_rate": 1.9736842105263158e-05, "loss": 0.6902, "step": 120 }, { "epoch": 0.11968348170128586, "grad_norm": 0.30258592751919755, "learning_rate": 1.9901315789473684e-05, "loss": 0.6828, "step": 121 }, { "epoch": 0.12067260138476756, "grad_norm": 0.32157745711913, "learning_rate": 2.0065789473684213e-05, "loss": 0.7062, "step": 122 }, { "epoch": 0.12166172106824925, "grad_norm": 0.38984990793746416, "learning_rate": 2.0230263157894738e-05, "loss": 0.7383, "step": 123 }, { "epoch": 0.12265084075173097, "grad_norm": 0.4111544139628007, "learning_rate": 2.0394736842105264e-05, "loss": 0.6269, "step": 124 }, { "epoch": 0.12363996043521266, "grad_norm": 0.39448247544238374, "learning_rate": 2.055921052631579e-05, "loss": 0.7416, "step": 125 }, { "epoch": 0.12462908011869436, "grad_norm": 0.3444637293280687, "learning_rate": 2.0723684210526318e-05, "loss": 0.6733, "step": 126 }, { "epoch": 0.12561819980217606, "grad_norm": 0.4001985495233771, "learning_rate": 2.0888157894736843e-05, "loss": 0.6515, "step": 127 }, { "epoch": 0.12660731948565776, "grad_norm": 0.3794186330915447, "learning_rate": 2.105263157894737e-05, "loss": 0.662, "step": 128 }, { "epoch": 0.12759643916913946, "grad_norm": 0.38824093039999913, "learning_rate": 2.1217105263157898e-05, "loss": 0.7238, "step": 129 }, { "epoch": 0.12858555885262116, "grad_norm": 0.4556524964885303, "learning_rate": 2.1381578947368423e-05, "loss": 0.6779, "step": 130 }, { "epoch": 0.12957467853610286, "grad_norm": 0.3643359091192952, "learning_rate": 2.154605263157895e-05, "loss": 0.673, "step": 131 }, { "epoch": 0.13056379821958458, "grad_norm": 0.491430142042037, "learning_rate": 2.1710526315789474e-05, "loss": 0.6031, "step": 132 }, { "epoch": 0.13155291790306628, "grad_norm": 0.3644642242254432, "learning_rate": 2.1875e-05, "loss": 0.6054, "step": 133 }, { "epoch": 0.13254203758654798, "grad_norm": 0.4391260070830592, "learning_rate": 2.2039473684210525e-05, "loss": 0.569, "step": 134 }, { "epoch": 0.13353115727002968, "grad_norm": 0.3275145794011892, "learning_rate": 2.2203947368421054e-05, "loss": 0.6724, "step": 135 }, { "epoch": 0.13452027695351138, "grad_norm": 0.4672558471628718, "learning_rate": 2.236842105263158e-05, "loss": 0.6973, "step": 136 }, { "epoch": 0.13550939663699307, "grad_norm": 0.37484287127842214, "learning_rate": 2.2532894736842105e-05, "loss": 0.6024, "step": 137 }, { "epoch": 0.13649851632047477, "grad_norm": 0.3741616001117786, "learning_rate": 2.2697368421052634e-05, "loss": 0.6354, "step": 138 }, { "epoch": 0.13748763600395647, "grad_norm": 0.40705218782451663, "learning_rate": 2.286184210526316e-05, "loss": 0.7532, "step": 139 }, { "epoch": 0.13847675568743817, "grad_norm": 0.3677599289459387, "learning_rate": 2.3026315789473685e-05, "loss": 0.7191, "step": 140 }, { "epoch": 0.1394658753709199, "grad_norm": 0.3999859660311767, "learning_rate": 2.3190789473684214e-05, "loss": 0.6836, "step": 141 }, { "epoch": 0.1404549950544016, "grad_norm": 0.38230856108223893, "learning_rate": 2.335526315789474e-05, "loss": 0.6404, "step": 142 }, { "epoch": 0.1414441147378833, "grad_norm": 0.3628560484729775, "learning_rate": 2.3519736842105265e-05, "loss": 0.654, "step": 143 }, { "epoch": 0.142433234421365, "grad_norm": 0.3448007883826662, "learning_rate": 2.368421052631579e-05, "loss": 0.6698, "step": 144 }, { "epoch": 0.1434223541048467, "grad_norm": 0.5552741489616734, "learning_rate": 2.3848684210526316e-05, "loss": 0.7492, "step": 145 }, { "epoch": 0.1444114737883284, "grad_norm": 0.3342617236256358, "learning_rate": 2.401315789473684e-05, "loss": 0.5378, "step": 146 }, { "epoch": 0.14540059347181009, "grad_norm": 0.3994524276771364, "learning_rate": 2.4177631578947367e-05, "loss": 0.5827, "step": 147 }, { "epoch": 0.14638971315529178, "grad_norm": 0.40544863606069553, "learning_rate": 2.4342105263157896e-05, "loss": 0.6543, "step": 148 }, { "epoch": 0.14737883283877348, "grad_norm": 0.36633930457542324, "learning_rate": 2.450657894736842e-05, "loss": 0.6954, "step": 149 }, { "epoch": 0.14836795252225518, "grad_norm": 0.36860483728084853, "learning_rate": 2.4671052631578947e-05, "loss": 0.6117, "step": 150 }, { "epoch": 0.1493570722057369, "grad_norm": 0.40299273023116766, "learning_rate": 2.4835526315789476e-05, "loss": 0.6677, "step": 151 }, { "epoch": 0.1503461918892186, "grad_norm": 0.46292039106861615, "learning_rate": 2.5e-05, "loss": 0.7554, "step": 152 }, { "epoch": 0.1513353115727003, "grad_norm": 0.3858429666089543, "learning_rate": 2.5164473684210527e-05, "loss": 0.6302, "step": 153 }, { "epoch": 0.152324431256182, "grad_norm": 0.5074668120955147, "learning_rate": 2.5328947368421052e-05, "loss": 0.6442, "step": 154 }, { "epoch": 0.1533135509396637, "grad_norm": 0.37588795715965484, "learning_rate": 2.5493421052631578e-05, "loss": 0.6647, "step": 155 }, { "epoch": 0.1543026706231454, "grad_norm": 0.563121630811762, "learning_rate": 2.565789473684211e-05, "loss": 0.6455, "step": 156 }, { "epoch": 0.1552917903066271, "grad_norm": 0.48379869834031464, "learning_rate": 2.5822368421052635e-05, "loss": 0.723, "step": 157 }, { "epoch": 0.1562809099901088, "grad_norm": 0.3916557190918107, "learning_rate": 2.598684210526316e-05, "loss": 0.5728, "step": 158 }, { "epoch": 0.1572700296735905, "grad_norm": 0.4241188982146622, "learning_rate": 2.6151315789473686e-05, "loss": 0.617, "step": 159 }, { "epoch": 0.1582591493570722, "grad_norm": 0.45540106668453567, "learning_rate": 2.6315789473684212e-05, "loss": 0.6104, "step": 160 }, { "epoch": 0.15924826904055392, "grad_norm": 0.4247150549586619, "learning_rate": 2.6480263157894737e-05, "loss": 0.6001, "step": 161 }, { "epoch": 0.16023738872403562, "grad_norm": 0.4460228840551562, "learning_rate": 2.6644736842105266e-05, "loss": 0.5465, "step": 162 }, { "epoch": 0.16122650840751732, "grad_norm": 0.38351087890774094, "learning_rate": 2.6809210526315792e-05, "loss": 0.6591, "step": 163 }, { "epoch": 0.16221562809099901, "grad_norm": 0.46934047916659694, "learning_rate": 2.6973684210526317e-05, "loss": 0.7279, "step": 164 }, { "epoch": 0.1632047477744807, "grad_norm": 0.4222690324413176, "learning_rate": 2.7138157894736843e-05, "loss": 0.6264, "step": 165 }, { "epoch": 0.1641938674579624, "grad_norm": 0.4697752681692279, "learning_rate": 2.730263157894737e-05, "loss": 0.7027, "step": 166 }, { "epoch": 0.1651829871414441, "grad_norm": 0.4702974530466207, "learning_rate": 2.7467105263157894e-05, "loss": 0.6994, "step": 167 }, { "epoch": 0.1661721068249258, "grad_norm": 0.541483976312347, "learning_rate": 2.7631578947368426e-05, "loss": 0.6525, "step": 168 }, { "epoch": 0.1671612265084075, "grad_norm": 0.3428171204640418, "learning_rate": 2.779605263157895e-05, "loss": 0.6431, "step": 169 }, { "epoch": 0.1681503461918892, "grad_norm": 0.472693449324751, "learning_rate": 2.7960526315789477e-05, "loss": 0.6693, "step": 170 }, { "epoch": 0.16913946587537093, "grad_norm": 0.4554753159492845, "learning_rate": 2.8125000000000003e-05, "loss": 0.6924, "step": 171 }, { "epoch": 0.17012858555885263, "grad_norm": 0.4616962159670467, "learning_rate": 2.8289473684210528e-05, "loss": 0.6082, "step": 172 }, { "epoch": 0.17111770524233433, "grad_norm": 0.42013241457970946, "learning_rate": 2.8453947368421054e-05, "loss": 0.5976, "step": 173 }, { "epoch": 0.17210682492581603, "grad_norm": 0.49647836517813876, "learning_rate": 2.861842105263158e-05, "loss": 0.6976, "step": 174 }, { "epoch": 0.17309594460929772, "grad_norm": 0.4656047888630097, "learning_rate": 2.8782894736842108e-05, "loss": 0.6689, "step": 175 }, { "epoch": 0.17408506429277942, "grad_norm": 0.62173274981684, "learning_rate": 2.8947368421052634e-05, "loss": 0.5864, "step": 176 }, { "epoch": 0.17507418397626112, "grad_norm": 0.44514460495733604, "learning_rate": 2.911184210526316e-05, "loss": 0.5959, "step": 177 }, { "epoch": 0.17606330365974282, "grad_norm": 0.6575269476526132, "learning_rate": 2.9276315789473684e-05, "loss": 0.6128, "step": 178 }, { "epoch": 0.17705242334322452, "grad_norm": 0.4280454874654953, "learning_rate": 2.944078947368421e-05, "loss": 0.7002, "step": 179 }, { "epoch": 0.17804154302670624, "grad_norm": 0.455273030279468, "learning_rate": 2.9605263157894735e-05, "loss": 0.682, "step": 180 }, { "epoch": 0.17903066271018794, "grad_norm": 0.4558325615984045, "learning_rate": 2.9769736842105268e-05, "loss": 0.7028, "step": 181 }, { "epoch": 0.18001978239366964, "grad_norm": 0.5877708520421765, "learning_rate": 2.9934210526315793e-05, "loss": 0.6836, "step": 182 }, { "epoch": 0.18100890207715134, "grad_norm": 0.4596794024140408, "learning_rate": 3.009868421052632e-05, "loss": 0.6797, "step": 183 }, { "epoch": 0.18199802176063304, "grad_norm": 0.4319051245779275, "learning_rate": 3.0263157894736844e-05, "loss": 0.6282, "step": 184 }, { "epoch": 0.18298714144411474, "grad_norm": 0.5318817072015158, "learning_rate": 3.042763157894737e-05, "loss": 0.6233, "step": 185 }, { "epoch": 0.18397626112759644, "grad_norm": 0.4322921072671814, "learning_rate": 3.059210526315789e-05, "loss": 0.6573, "step": 186 }, { "epoch": 0.18496538081107813, "grad_norm": 0.4885053114909321, "learning_rate": 3.075657894736843e-05, "loss": 0.6871, "step": 187 }, { "epoch": 0.18595450049455983, "grad_norm": 0.4473265190512735, "learning_rate": 3.092105263157895e-05, "loss": 0.6173, "step": 188 }, { "epoch": 0.18694362017804153, "grad_norm": 0.5440786287668955, "learning_rate": 3.108552631578948e-05, "loss": 0.6141, "step": 189 }, { "epoch": 0.18793273986152326, "grad_norm": 0.35769667117178866, "learning_rate": 3.125e-05, "loss": 0.694, "step": 190 }, { "epoch": 0.18892185954500496, "grad_norm": 0.5059649250810946, "learning_rate": 3.141447368421053e-05, "loss": 0.6728, "step": 191 }, { "epoch": 0.18991097922848665, "grad_norm": 0.38729560510609135, "learning_rate": 3.157894736842105e-05, "loss": 0.5709, "step": 192 }, { "epoch": 0.19090009891196835, "grad_norm": 0.5371616873491071, "learning_rate": 3.174342105263158e-05, "loss": 0.7155, "step": 193 }, { "epoch": 0.19188921859545005, "grad_norm": 0.6053710744464506, "learning_rate": 3.190789473684211e-05, "loss": 0.6955, "step": 194 }, { "epoch": 0.19287833827893175, "grad_norm": 0.413136441297711, "learning_rate": 3.207236842105263e-05, "loss": 0.5837, "step": 195 }, { "epoch": 0.19386745796241345, "grad_norm": 0.6398949332164223, "learning_rate": 3.223684210526316e-05, "loss": 0.7342, "step": 196 }, { "epoch": 0.19485657764589515, "grad_norm": 0.61809961884169, "learning_rate": 3.240131578947368e-05, "loss": 0.6147, "step": 197 }, { "epoch": 0.19584569732937684, "grad_norm": 0.44467152661669995, "learning_rate": 3.256578947368421e-05, "loss": 0.5646, "step": 198 }, { "epoch": 0.19683481701285854, "grad_norm": 0.4851641919265251, "learning_rate": 3.2730263157894734e-05, "loss": 0.6334, "step": 199 }, { "epoch": 0.19782393669634027, "grad_norm": 0.5062493576014657, "learning_rate": 3.289473684210527e-05, "loss": 0.5911, "step": 200 }, { "epoch": 0.19881305637982197, "grad_norm": 0.5527966709235791, "learning_rate": 3.305921052631579e-05, "loss": 0.6273, "step": 201 }, { "epoch": 0.19980217606330367, "grad_norm": 0.633334866950127, "learning_rate": 3.322368421052632e-05, "loss": 0.7025, "step": 202 }, { "epoch": 0.20079129574678536, "grad_norm": 0.48779295740663486, "learning_rate": 3.338815789473684e-05, "loss": 0.5861, "step": 203 }, { "epoch": 0.20178041543026706, "grad_norm": 0.6070005751744674, "learning_rate": 3.355263157894737e-05, "loss": 0.7428, "step": 204 }, { "epoch": 0.20276953511374876, "grad_norm": 0.534136315876146, "learning_rate": 3.371710526315789e-05, "loss": 0.5728, "step": 205 }, { "epoch": 0.20375865479723046, "grad_norm": 0.48792763692000546, "learning_rate": 3.388157894736842e-05, "loss": 0.586, "step": 206 }, { "epoch": 0.20474777448071216, "grad_norm": 0.52009416122117, "learning_rate": 3.404605263157895e-05, "loss": 0.4851, "step": 207 }, { "epoch": 0.20573689416419386, "grad_norm": 0.5698103555541044, "learning_rate": 3.421052631578947e-05, "loss": 0.6331, "step": 208 }, { "epoch": 0.20672601384767555, "grad_norm": 0.509391746916777, "learning_rate": 3.4375e-05, "loss": 0.5839, "step": 209 }, { "epoch": 0.20771513353115728, "grad_norm": 0.6184628148195033, "learning_rate": 3.4539473684210524e-05, "loss": 0.7618, "step": 210 }, { "epoch": 0.20870425321463898, "grad_norm": 0.55512739516661, "learning_rate": 3.470394736842105e-05, "loss": 0.6801, "step": 211 }, { "epoch": 0.20969337289812068, "grad_norm": 0.6380611857299497, "learning_rate": 3.4868421052631575e-05, "loss": 0.6445, "step": 212 }, { "epoch": 0.21068249258160238, "grad_norm": 0.48106021749776834, "learning_rate": 3.503289473684211e-05, "loss": 0.704, "step": 213 }, { "epoch": 0.21167161226508407, "grad_norm": 0.5155510274611654, "learning_rate": 3.519736842105263e-05, "loss": 0.603, "step": 214 }, { "epoch": 0.21266073194856577, "grad_norm": 0.41115306939250096, "learning_rate": 3.536184210526316e-05, "loss": 0.6204, "step": 215 }, { "epoch": 0.21364985163204747, "grad_norm": 0.42847524914883034, "learning_rate": 3.5526315789473684e-05, "loss": 0.6293, "step": 216 }, { "epoch": 0.21463897131552917, "grad_norm": 0.40362083266366067, "learning_rate": 3.569078947368421e-05, "loss": 0.6025, "step": 217 }, { "epoch": 0.21562809099901087, "grad_norm": 0.36583415568507316, "learning_rate": 3.5855263157894735e-05, "loss": 0.6429, "step": 218 }, { "epoch": 0.2166172106824926, "grad_norm": 0.3792237531903649, "learning_rate": 3.6019736842105264e-05, "loss": 0.6304, "step": 219 }, { "epoch": 0.2176063303659743, "grad_norm": 0.5016218500071241, "learning_rate": 3.618421052631579e-05, "loss": 0.6072, "step": 220 }, { "epoch": 0.218595450049456, "grad_norm": 0.5195805824452137, "learning_rate": 3.6348684210526315e-05, "loss": 0.6955, "step": 221 }, { "epoch": 0.2195845697329377, "grad_norm": 0.40646432999857474, "learning_rate": 3.6513157894736844e-05, "loss": 0.5941, "step": 222 }, { "epoch": 0.2205736894164194, "grad_norm": 0.48614279865705684, "learning_rate": 3.6677631578947366e-05, "loss": 0.7392, "step": 223 }, { "epoch": 0.2215628090999011, "grad_norm": 0.4833284553690217, "learning_rate": 3.6842105263157895e-05, "loss": 0.7144, "step": 224 }, { "epoch": 0.22255192878338279, "grad_norm": 0.4123480628971663, "learning_rate": 3.7006578947368424e-05, "loss": 0.5997, "step": 225 }, { "epoch": 0.22354104846686448, "grad_norm": 0.58482567880444, "learning_rate": 3.717105263157895e-05, "loss": 0.625, "step": 226 }, { "epoch": 0.22453016815034618, "grad_norm": 0.4839150358758274, "learning_rate": 3.7335526315789475e-05, "loss": 0.5947, "step": 227 }, { "epoch": 0.22551928783382788, "grad_norm": 0.5821490050298971, "learning_rate": 3.7500000000000003e-05, "loss": 0.63, "step": 228 }, { "epoch": 0.2265084075173096, "grad_norm": 0.4025831144151843, "learning_rate": 3.7664473684210526e-05, "loss": 0.5721, "step": 229 }, { "epoch": 0.2274975272007913, "grad_norm": 0.5564325010312093, "learning_rate": 3.7828947368421054e-05, "loss": 0.6745, "step": 230 }, { "epoch": 0.228486646884273, "grad_norm": 0.559427271458402, "learning_rate": 3.7993421052631577e-05, "loss": 0.6608, "step": 231 }, { "epoch": 0.2294757665677547, "grad_norm": 0.4961691077638613, "learning_rate": 3.815789473684211e-05, "loss": 0.673, "step": 232 }, { "epoch": 0.2304648862512364, "grad_norm": 0.4451618595226397, "learning_rate": 3.8322368421052634e-05, "loss": 0.6879, "step": 233 }, { "epoch": 0.2314540059347181, "grad_norm": 0.4764748813476515, "learning_rate": 3.848684210526316e-05, "loss": 0.6593, "step": 234 }, { "epoch": 0.2324431256181998, "grad_norm": 0.4781168643440134, "learning_rate": 3.8651315789473685e-05, "loss": 0.6521, "step": 235 }, { "epoch": 0.2334322453016815, "grad_norm": 0.4317515889033491, "learning_rate": 3.8815789473684214e-05, "loss": 0.7187, "step": 236 }, { "epoch": 0.2344213649851632, "grad_norm": 0.5010679062349306, "learning_rate": 3.8980263157894736e-05, "loss": 0.6075, "step": 237 }, { "epoch": 0.2354104846686449, "grad_norm": 0.48215015664396305, "learning_rate": 3.9144736842105265e-05, "loss": 0.6168, "step": 238 }, { "epoch": 0.23639960435212662, "grad_norm": 0.4522478488377238, "learning_rate": 3.9309210526315794e-05, "loss": 0.6396, "step": 239 }, { "epoch": 0.23738872403560832, "grad_norm": 0.4577399461188642, "learning_rate": 3.9473684210526316e-05, "loss": 0.7265, "step": 240 }, { "epoch": 0.23837784371909002, "grad_norm": 2.1998133857858337, "learning_rate": 3.9638157894736845e-05, "loss": 0.6542, "step": 241 }, { "epoch": 0.23936696340257171, "grad_norm": 0.4803483208680404, "learning_rate": 3.980263157894737e-05, "loss": 0.6325, "step": 242 }, { "epoch": 0.2403560830860534, "grad_norm": 0.42001637823181315, "learning_rate": 3.9967105263157896e-05, "loss": 0.6449, "step": 243 }, { "epoch": 0.2413452027695351, "grad_norm": 0.6051649931482518, "learning_rate": 4.0131578947368425e-05, "loss": 0.6015, "step": 244 }, { "epoch": 0.2423343224530168, "grad_norm": 0.462743464454851, "learning_rate": 4.0296052631578954e-05, "loss": 0.6353, "step": 245 }, { "epoch": 0.2433234421364985, "grad_norm": 0.5965079725135056, "learning_rate": 4.0460526315789476e-05, "loss": 0.6813, "step": 246 }, { "epoch": 0.2443125618199802, "grad_norm": 0.515915006211205, "learning_rate": 4.0625000000000005e-05, "loss": 0.6897, "step": 247 }, { "epoch": 0.24530168150346193, "grad_norm": 0.5532763347341395, "learning_rate": 4.078947368421053e-05, "loss": 0.7042, "step": 248 }, { "epoch": 0.24629080118694363, "grad_norm": 0.46137687834271673, "learning_rate": 4.0953947368421056e-05, "loss": 0.5839, "step": 249 }, { "epoch": 0.24727992087042533, "grad_norm": 0.47745663163187557, "learning_rate": 4.111842105263158e-05, "loss": 0.6566, "step": 250 }, { "epoch": 0.24826904055390703, "grad_norm": 0.44493193669631287, "learning_rate": 4.128289473684211e-05, "loss": 0.6223, "step": 251 }, { "epoch": 0.24925816023738873, "grad_norm": 0.4169310337329136, "learning_rate": 4.1447368421052636e-05, "loss": 0.5799, "step": 252 }, { "epoch": 0.2502472799208704, "grad_norm": 0.5324378545388998, "learning_rate": 4.161184210526316e-05, "loss": 0.586, "step": 253 }, { "epoch": 0.2512363996043521, "grad_norm": 0.4713752675672079, "learning_rate": 4.177631578947369e-05, "loss": 0.5844, "step": 254 }, { "epoch": 0.2522255192878338, "grad_norm": 0.4246792763221131, "learning_rate": 4.194078947368421e-05, "loss": 0.6691, "step": 255 }, { "epoch": 0.2532146389713155, "grad_norm": 0.5996820122986719, "learning_rate": 4.210526315789474e-05, "loss": 0.6205, "step": 256 }, { "epoch": 0.2542037586547972, "grad_norm": 0.47226024365214625, "learning_rate": 4.226973684210527e-05, "loss": 0.6654, "step": 257 }, { "epoch": 0.2551928783382789, "grad_norm": 0.4718080623174236, "learning_rate": 4.2434210526315796e-05, "loss": 0.5252, "step": 258 }, { "epoch": 0.2561819980217606, "grad_norm": 0.6350779709078309, "learning_rate": 4.259868421052632e-05, "loss": 0.6221, "step": 259 }, { "epoch": 0.2571711177052423, "grad_norm": 0.5362489170689355, "learning_rate": 4.2763157894736847e-05, "loss": 0.59, "step": 260 }, { "epoch": 0.258160237388724, "grad_norm": 0.5037993125750314, "learning_rate": 4.292763157894737e-05, "loss": 0.5977, "step": 261 }, { "epoch": 0.2591493570722057, "grad_norm": 0.7119819210503119, "learning_rate": 4.30921052631579e-05, "loss": 0.5687, "step": 262 }, { "epoch": 0.26013847675568746, "grad_norm": 0.48734181172443314, "learning_rate": 4.3256578947368426e-05, "loss": 0.5818, "step": 263 }, { "epoch": 0.26112759643916916, "grad_norm": 0.5307408330986926, "learning_rate": 4.342105263157895e-05, "loss": 0.5485, "step": 264 }, { "epoch": 0.26211671612265086, "grad_norm": 0.6102212182790211, "learning_rate": 4.358552631578948e-05, "loss": 0.6235, "step": 265 }, { "epoch": 0.26310583580613256, "grad_norm": 0.5928998977874212, "learning_rate": 4.375e-05, "loss": 0.6044, "step": 266 }, { "epoch": 0.26409495548961426, "grad_norm": 0.6688550804132813, "learning_rate": 4.391447368421053e-05, "loss": 0.6217, "step": 267 }, { "epoch": 0.26508407517309596, "grad_norm": 0.7818753906542993, "learning_rate": 4.407894736842105e-05, "loss": 0.5282, "step": 268 }, { "epoch": 0.26607319485657766, "grad_norm": 1.0091627489834931, "learning_rate": 4.424342105263158e-05, "loss": 0.5692, "step": 269 }, { "epoch": 0.26706231454005935, "grad_norm": 0.6011818743170259, "learning_rate": 4.440789473684211e-05, "loss": 0.6119, "step": 270 }, { "epoch": 0.26805143422354105, "grad_norm": 0.8838865925562805, "learning_rate": 4.457236842105264e-05, "loss": 0.6043, "step": 271 }, { "epoch": 0.26904055390702275, "grad_norm": 0.6314911229429284, "learning_rate": 4.473684210526316e-05, "loss": 0.7673, "step": 272 }, { "epoch": 0.27002967359050445, "grad_norm": 0.6804637239015356, "learning_rate": 4.490131578947369e-05, "loss": 0.5926, "step": 273 }, { "epoch": 0.27101879327398615, "grad_norm": 0.547035473768601, "learning_rate": 4.506578947368421e-05, "loss": 0.5431, "step": 274 }, { "epoch": 0.27200791295746785, "grad_norm": 0.49151905906223226, "learning_rate": 4.523026315789474e-05, "loss": 0.5535, "step": 275 }, { "epoch": 0.27299703264094954, "grad_norm": 0.4380519372013023, "learning_rate": 4.539473684210527e-05, "loss": 0.7109, "step": 276 }, { "epoch": 0.27398615232443124, "grad_norm": 0.7217367910139244, "learning_rate": 4.555921052631579e-05, "loss": 0.6292, "step": 277 }, { "epoch": 0.27497527200791294, "grad_norm": 0.5585776926241103, "learning_rate": 4.572368421052632e-05, "loss": 0.6974, "step": 278 }, { "epoch": 0.27596439169139464, "grad_norm": 0.4555730594700509, "learning_rate": 4.588815789473684e-05, "loss": 0.6425, "step": 279 }, { "epoch": 0.27695351137487634, "grad_norm": 0.6389516893536279, "learning_rate": 4.605263157894737e-05, "loss": 0.5537, "step": 280 }, { "epoch": 0.27794263105835804, "grad_norm": 0.46095286770730254, "learning_rate": 4.621710526315789e-05, "loss": 0.6019, "step": 281 }, { "epoch": 0.2789317507418398, "grad_norm": 0.44048312996010536, "learning_rate": 4.638157894736843e-05, "loss": 0.612, "step": 282 }, { "epoch": 0.2799208704253215, "grad_norm": 1.0389287178149595, "learning_rate": 4.654605263157895e-05, "loss": 0.6571, "step": 283 }, { "epoch": 0.2809099901088032, "grad_norm": 0.6903364570618536, "learning_rate": 4.671052631578948e-05, "loss": 0.6973, "step": 284 }, { "epoch": 0.2818991097922849, "grad_norm": 0.4477703451666338, "learning_rate": 4.6875e-05, "loss": 0.5894, "step": 285 }, { "epoch": 0.2828882294757666, "grad_norm": 0.71253667717858, "learning_rate": 4.703947368421053e-05, "loss": 0.6158, "step": 286 }, { "epoch": 0.2838773491592483, "grad_norm": 0.6605589318131366, "learning_rate": 4.720394736842105e-05, "loss": 0.5547, "step": 287 }, { "epoch": 0.28486646884273, "grad_norm": 0.4142085186922097, "learning_rate": 4.736842105263158e-05, "loss": 0.6035, "step": 288 }, { "epoch": 0.2858555885262117, "grad_norm": 0.5522552002749757, "learning_rate": 4.753289473684211e-05, "loss": 0.6205, "step": 289 }, { "epoch": 0.2868447082096934, "grad_norm": 0.4553692557081208, "learning_rate": 4.769736842105263e-05, "loss": 0.5847, "step": 290 }, { "epoch": 0.2878338278931751, "grad_norm": 0.4677603638371184, "learning_rate": 4.786184210526316e-05, "loss": 0.6153, "step": 291 }, { "epoch": 0.2888229475766568, "grad_norm": 0.716448078885814, "learning_rate": 4.802631578947368e-05, "loss": 0.6088, "step": 292 }, { "epoch": 0.2898120672601385, "grad_norm": 0.5629123910634363, "learning_rate": 4.819078947368421e-05, "loss": 0.7418, "step": 293 }, { "epoch": 0.29080118694362017, "grad_norm": 0.5684632906175011, "learning_rate": 4.8355263157894734e-05, "loss": 0.6617, "step": 294 }, { "epoch": 0.29179030662710187, "grad_norm": 0.7817295121483087, "learning_rate": 4.851973684210527e-05, "loss": 0.6587, "step": 295 }, { "epoch": 0.29277942631058357, "grad_norm": 0.43426647960692005, "learning_rate": 4.868421052631579e-05, "loss": 0.5338, "step": 296 }, { "epoch": 0.29376854599406527, "grad_norm": 0.7019981788147629, "learning_rate": 4.884868421052632e-05, "loss": 0.6053, "step": 297 }, { "epoch": 0.29475766567754697, "grad_norm": 3.0951979705836496, "learning_rate": 4.901315789473684e-05, "loss": 0.5918, "step": 298 }, { "epoch": 0.29574678536102866, "grad_norm": 1.0233134362024605, "learning_rate": 4.917763157894737e-05, "loss": 0.6081, "step": 299 }, { "epoch": 0.29673590504451036, "grad_norm": 0.9953914245723955, "learning_rate": 4.9342105263157894e-05, "loss": 0.5341, "step": 300 }, { "epoch": 0.29772502472799206, "grad_norm": 0.7092328246731249, "learning_rate": 4.950657894736843e-05, "loss": 0.7031, "step": 301 }, { "epoch": 0.2987141444114738, "grad_norm": 1.1657097575552566, "learning_rate": 4.967105263157895e-05, "loss": 0.5386, "step": 302 }, { "epoch": 0.2997032640949555, "grad_norm": 0.64809864753375, "learning_rate": 4.983552631578948e-05, "loss": 0.5822, "step": 303 }, { "epoch": 0.3006923837784372, "grad_norm": 10.779831456434477, "learning_rate": 5e-05, "loss": 1.3367, "step": 304 }, { "epoch": 0.3016815034619189, "grad_norm": 2.263953506492712, "learning_rate": 4.9981678270428736e-05, "loss": 0.652, "step": 305 }, { "epoch": 0.3026706231454006, "grad_norm": 1.1026406639523274, "learning_rate": 4.996335654085746e-05, "loss": 0.5982, "step": 306 }, { "epoch": 0.3036597428288823, "grad_norm": 1.5390268402197822, "learning_rate": 4.994503481128619e-05, "loss": 0.5838, "step": 307 }, { "epoch": 0.304648862512364, "grad_norm": 2.325507669292494, "learning_rate": 4.9926713081714915e-05, "loss": 0.7096, "step": 308 }, { "epoch": 0.3056379821958457, "grad_norm": 0.9681764004322482, "learning_rate": 4.990839135214365e-05, "loss": 0.5579, "step": 309 }, { "epoch": 0.3066271018793274, "grad_norm": 1.1536956780162377, "learning_rate": 4.9890069622572374e-05, "loss": 0.6127, "step": 310 }, { "epoch": 0.3076162215628091, "grad_norm": 0.934115930451099, "learning_rate": 4.98717478930011e-05, "loss": 0.5808, "step": 311 }, { "epoch": 0.3086053412462908, "grad_norm": 1.1749017282382166, "learning_rate": 4.985342616342983e-05, "loss": 0.6422, "step": 312 }, { "epoch": 0.3095944609297725, "grad_norm": 0.8515985999112896, "learning_rate": 4.983510443385856e-05, "loss": 0.6175, "step": 313 }, { "epoch": 0.3105835806132542, "grad_norm": 6.518960373233818, "learning_rate": 4.981678270428729e-05, "loss": 0.8142, "step": 314 }, { "epoch": 0.3115727002967359, "grad_norm": 2.1921818273490374, "learning_rate": 4.979846097471601e-05, "loss": 0.6226, "step": 315 }, { "epoch": 0.3125618199802176, "grad_norm": 1.6420624734489004, "learning_rate": 4.9780139245144747e-05, "loss": 0.6004, "step": 316 }, { "epoch": 0.3135509396636993, "grad_norm": 0.7686264715968514, "learning_rate": 4.976181751557347e-05, "loss": 0.587, "step": 317 }, { "epoch": 0.314540059347181, "grad_norm": 1.6301825964138899, "learning_rate": 4.9743495786002206e-05, "loss": 0.5783, "step": 318 }, { "epoch": 0.3155291790306627, "grad_norm": 1.7471449432767963, "learning_rate": 4.9725174056430926e-05, "loss": 0.6721, "step": 319 }, { "epoch": 0.3165182987141444, "grad_norm": 1.1192919506428751, "learning_rate": 4.970685232685966e-05, "loss": 0.6192, "step": 320 }, { "epoch": 0.31750741839762614, "grad_norm": 1.269040128114316, "learning_rate": 4.9688530597288385e-05, "loss": 0.6129, "step": 321 }, { "epoch": 0.31849653808110784, "grad_norm": 0.828446461431732, "learning_rate": 4.967020886771712e-05, "loss": 0.6483, "step": 322 }, { "epoch": 0.31948565776458954, "grad_norm": 0.8666239492089782, "learning_rate": 4.9651887138145845e-05, "loss": 0.5733, "step": 323 }, { "epoch": 0.32047477744807124, "grad_norm": 0.7549123620762485, "learning_rate": 4.963356540857457e-05, "loss": 0.6196, "step": 324 }, { "epoch": 0.32146389713155293, "grad_norm": 1.8217639567133426, "learning_rate": 4.96152436790033e-05, "loss": 0.6356, "step": 325 }, { "epoch": 0.32245301681503463, "grad_norm": 2.292729397977442, "learning_rate": 4.959692194943203e-05, "loss": 0.6607, "step": 326 }, { "epoch": 0.32344213649851633, "grad_norm": 0.83818756217215, "learning_rate": 4.957860021986076e-05, "loss": 0.6217, "step": 327 }, { "epoch": 0.32443125618199803, "grad_norm": 0.676734652693397, "learning_rate": 4.9560278490289484e-05, "loss": 0.592, "step": 328 }, { "epoch": 0.3254203758654797, "grad_norm": 0.5902774697246586, "learning_rate": 4.954195676071822e-05, "loss": 0.6153, "step": 329 }, { "epoch": 0.3264094955489614, "grad_norm": 0.800723034946388, "learning_rate": 4.9523635031146943e-05, "loss": 0.664, "step": 330 }, { "epoch": 0.3273986152324431, "grad_norm": 0.5767708938288155, "learning_rate": 4.950531330157567e-05, "loss": 0.5934, "step": 331 }, { "epoch": 0.3283877349159248, "grad_norm": 0.7493746467171817, "learning_rate": 4.9486991572004396e-05, "loss": 0.633, "step": 332 }, { "epoch": 0.3293768545994065, "grad_norm": 0.8168408769910189, "learning_rate": 4.946866984243313e-05, "loss": 0.5523, "step": 333 }, { "epoch": 0.3303659742828882, "grad_norm": 0.6357012806203373, "learning_rate": 4.9450348112861856e-05, "loss": 0.5584, "step": 334 }, { "epoch": 0.3313550939663699, "grad_norm": 0.9570738821630661, "learning_rate": 4.943202638329059e-05, "loss": 0.5602, "step": 335 }, { "epoch": 0.3323442136498516, "grad_norm": 0.6271482411746069, "learning_rate": 4.941370465371931e-05, "loss": 0.6037, "step": 336 }, { "epoch": 0.3333333333333333, "grad_norm": 0.8342569793379687, "learning_rate": 4.939538292414804e-05, "loss": 0.6013, "step": 337 }, { "epoch": 0.334322453016815, "grad_norm": 0.8429515515235894, "learning_rate": 4.937706119457677e-05, "loss": 0.5744, "step": 338 }, { "epoch": 0.3353115727002967, "grad_norm": 0.41480878493392426, "learning_rate": 4.93587394650055e-05, "loss": 0.533, "step": 339 }, { "epoch": 0.3363006923837784, "grad_norm": 1.0762372950869659, "learning_rate": 4.934041773543423e-05, "loss": 0.6092, "step": 340 }, { "epoch": 0.33728981206726016, "grad_norm": 0.5352412089968016, "learning_rate": 4.9322096005862954e-05, "loss": 0.5744, "step": 341 }, { "epoch": 0.33827893175074186, "grad_norm": 11.795603000530546, "learning_rate": 4.930377427629169e-05, "loss": 0.8007, "step": 342 }, { "epoch": 0.33926805143422356, "grad_norm": 1.6654251993458582, "learning_rate": 4.9285452546720414e-05, "loss": 0.6833, "step": 343 }, { "epoch": 0.34025717111770526, "grad_norm": 1.023245256880444, "learning_rate": 4.926713081714914e-05, "loss": 0.5568, "step": 344 }, { "epoch": 0.34124629080118696, "grad_norm": 0.9874560723539015, "learning_rate": 4.924880908757787e-05, "loss": 0.6252, "step": 345 }, { "epoch": 0.34223541048466866, "grad_norm": 1.1346945125596772, "learning_rate": 4.92304873580066e-05, "loss": 0.667, "step": 346 }, { "epoch": 0.34322453016815035, "grad_norm": 0.7734482167051114, "learning_rate": 4.9212165628435326e-05, "loss": 0.5809, "step": 347 }, { "epoch": 0.34421364985163205, "grad_norm": 0.8763032330208194, "learning_rate": 4.919384389886405e-05, "loss": 0.6812, "step": 348 }, { "epoch": 0.34520276953511375, "grad_norm": 0.8014512357084477, "learning_rate": 4.917552216929278e-05, "loss": 0.5391, "step": 349 }, { "epoch": 0.34619188921859545, "grad_norm": 0.6557118095626164, "learning_rate": 4.915720043972151e-05, "loss": 0.5202, "step": 350 }, { "epoch": 0.34718100890207715, "grad_norm": 0.6296821730209303, "learning_rate": 4.913887871015024e-05, "loss": 0.4884, "step": 351 }, { "epoch": 0.34817012858555885, "grad_norm": 0.7422194964288009, "learning_rate": 4.912055698057897e-05, "loss": 0.5452, "step": 352 }, { "epoch": 0.34915924826904055, "grad_norm": 0.5946076118500246, "learning_rate": 4.91022352510077e-05, "loss": 0.6166, "step": 353 }, { "epoch": 0.35014836795252224, "grad_norm": 0.732570052375268, "learning_rate": 4.9083913521436425e-05, "loss": 0.6611, "step": 354 }, { "epoch": 0.35113748763600394, "grad_norm": 0.4698106032909168, "learning_rate": 4.906559179186516e-05, "loss": 0.5601, "step": 355 }, { "epoch": 0.35212660731948564, "grad_norm": 1.2453092526176186, "learning_rate": 4.9047270062293885e-05, "loss": 0.6009, "step": 356 }, { "epoch": 0.35311572700296734, "grad_norm": 0.4418805854104305, "learning_rate": 4.902894833272261e-05, "loss": 0.6021, "step": 357 }, { "epoch": 0.35410484668644904, "grad_norm": 0.5326586341210466, "learning_rate": 4.901062660315134e-05, "loss": 0.5839, "step": 358 }, { "epoch": 0.35509396636993074, "grad_norm": 0.41554111643426483, "learning_rate": 4.899230487358007e-05, "loss": 0.5615, "step": 359 }, { "epoch": 0.3560830860534125, "grad_norm": 0.534276978725234, "learning_rate": 4.89739831440088e-05, "loss": 0.6854, "step": 360 }, { "epoch": 0.3570722057368942, "grad_norm": 0.41429157493399854, "learning_rate": 4.8955661414437523e-05, "loss": 0.5839, "step": 361 }, { "epoch": 0.3580613254203759, "grad_norm": 1.8567649357537324, "learning_rate": 4.893733968486625e-05, "loss": 0.5924, "step": 362 }, { "epoch": 0.3590504451038576, "grad_norm": 0.8369610972802223, "learning_rate": 4.891901795529498e-05, "loss": 0.6518, "step": 363 }, { "epoch": 0.3600395647873393, "grad_norm": 0.5759620691607789, "learning_rate": 4.8900696225723716e-05, "loss": 0.5869, "step": 364 }, { "epoch": 0.361028684470821, "grad_norm": 0.8489394500046676, "learning_rate": 4.8882374496152436e-05, "loss": 0.621, "step": 365 }, { "epoch": 0.3620178041543027, "grad_norm": 0.6560658008069779, "learning_rate": 4.886405276658117e-05, "loss": 0.6094, "step": 366 }, { "epoch": 0.3630069238377844, "grad_norm": 0.7865989416550275, "learning_rate": 4.8845731037009895e-05, "loss": 0.5788, "step": 367 }, { "epoch": 0.3639960435212661, "grad_norm": 0.6537131935874835, "learning_rate": 4.882740930743863e-05, "loss": 0.5797, "step": 368 }, { "epoch": 0.3649851632047478, "grad_norm": 0.6909493769793963, "learning_rate": 4.8809087577867355e-05, "loss": 0.5842, "step": 369 }, { "epoch": 0.3659742828882295, "grad_norm": 0.4769547461410241, "learning_rate": 4.879076584829608e-05, "loss": 0.6062, "step": 370 }, { "epoch": 0.3669634025717112, "grad_norm": 0.8200716144700505, "learning_rate": 4.877244411872481e-05, "loss": 0.6748, "step": 371 }, { "epoch": 0.36795252225519287, "grad_norm": 0.45770566187660844, "learning_rate": 4.875412238915354e-05, "loss": 0.527, "step": 372 }, { "epoch": 0.36894164193867457, "grad_norm": 0.6555897440490205, "learning_rate": 4.873580065958227e-05, "loss": 0.5222, "step": 373 }, { "epoch": 0.36993076162215627, "grad_norm": 0.7580354956119526, "learning_rate": 4.8717478930010994e-05, "loss": 0.6309, "step": 374 }, { "epoch": 0.37091988130563797, "grad_norm": 0.5439082223239077, "learning_rate": 4.869915720043972e-05, "loss": 0.5571, "step": 375 }, { "epoch": 0.37190900098911966, "grad_norm": 0.8331347227307716, "learning_rate": 4.8680835470868454e-05, "loss": 0.5607, "step": 376 }, { "epoch": 0.37289812067260136, "grad_norm": 0.5141516865612674, "learning_rate": 4.866251374129718e-05, "loss": 0.6567, "step": 377 }, { "epoch": 0.37388724035608306, "grad_norm": 0.7141235700823007, "learning_rate": 4.8644192011725906e-05, "loss": 0.5669, "step": 378 }, { "epoch": 0.37487636003956476, "grad_norm": 0.5560006726258888, "learning_rate": 4.862587028215464e-05, "loss": 0.6324, "step": 379 }, { "epoch": 0.3758654797230465, "grad_norm": 0.7409978312950718, "learning_rate": 4.8607548552583366e-05, "loss": 0.7242, "step": 380 }, { "epoch": 0.3768545994065282, "grad_norm": 0.5440930687616704, "learning_rate": 4.85892268230121e-05, "loss": 0.6254, "step": 381 }, { "epoch": 0.3778437190900099, "grad_norm": 0.6478252439008088, "learning_rate": 4.857090509344082e-05, "loss": 0.5264, "step": 382 }, { "epoch": 0.3788328387734916, "grad_norm": 0.4709599342774292, "learning_rate": 4.855258336386955e-05, "loss": 0.5647, "step": 383 }, { "epoch": 0.3798219584569733, "grad_norm": 1.18640771551092, "learning_rate": 4.853426163429828e-05, "loss": 0.7512, "step": 384 }, { "epoch": 0.380811078140455, "grad_norm": 0.3923235190086449, "learning_rate": 4.851593990472701e-05, "loss": 0.6049, "step": 385 }, { "epoch": 0.3818001978239367, "grad_norm": 0.4781166108075703, "learning_rate": 4.849761817515573e-05, "loss": 0.5453, "step": 386 }, { "epoch": 0.3827893175074184, "grad_norm": 0.3725786214381448, "learning_rate": 4.8479296445584464e-05, "loss": 0.5387, "step": 387 }, { "epoch": 0.3837784371909001, "grad_norm": 0.4987148506610239, "learning_rate": 4.84609747160132e-05, "loss": 0.547, "step": 388 }, { "epoch": 0.3847675568743818, "grad_norm": 0.35932327262892194, "learning_rate": 4.8442652986441924e-05, "loss": 0.5214, "step": 389 }, { "epoch": 0.3857566765578635, "grad_norm": 0.690564658045055, "learning_rate": 4.842433125687065e-05, "loss": 0.591, "step": 390 }, { "epoch": 0.3867457962413452, "grad_norm": 0.3463972013191391, "learning_rate": 4.840600952729938e-05, "loss": 0.6385, "step": 391 }, { "epoch": 0.3877349159248269, "grad_norm": 0.3543061291829264, "learning_rate": 4.838768779772811e-05, "loss": 0.517, "step": 392 }, { "epoch": 0.3887240356083086, "grad_norm": 0.3660023274107496, "learning_rate": 4.8369366068156837e-05, "loss": 0.5781, "step": 393 }, { "epoch": 0.3897131552917903, "grad_norm": 0.35899772143056496, "learning_rate": 4.835104433858556e-05, "loss": 0.6156, "step": 394 }, { "epoch": 0.390702274975272, "grad_norm": 0.4516133836964091, "learning_rate": 4.833272260901429e-05, "loss": 0.555, "step": 395 }, { "epoch": 0.3916913946587537, "grad_norm": 4.069658753953614, "learning_rate": 4.831440087944302e-05, "loss": 0.6786, "step": 396 }, { "epoch": 0.3926805143422354, "grad_norm": 0.5512608577859164, "learning_rate": 4.829607914987175e-05, "loss": 0.5819, "step": 397 }, { "epoch": 0.3936696340257171, "grad_norm": 0.6001718577059937, "learning_rate": 4.827775742030048e-05, "loss": 0.5866, "step": 398 }, { "epoch": 0.39465875370919884, "grad_norm": 0.43723220327977047, "learning_rate": 4.82594356907292e-05, "loss": 0.6223, "step": 399 }, { "epoch": 0.39564787339268054, "grad_norm": 0.4091883159043, "learning_rate": 4.8241113961157935e-05, "loss": 0.5937, "step": 400 }, { "epoch": 0.39663699307616224, "grad_norm": 0.3627182581990674, "learning_rate": 4.822279223158667e-05, "loss": 0.4927, "step": 401 }, { "epoch": 0.39762611275964393, "grad_norm": 3.395118475745649, "learning_rate": 4.8204470502015395e-05, "loss": 0.6651, "step": 402 }, { "epoch": 0.39861523244312563, "grad_norm": 0.503406651457647, "learning_rate": 4.818614877244412e-05, "loss": 0.6462, "step": 403 }, { "epoch": 0.39960435212660733, "grad_norm": 0.41164305421638864, "learning_rate": 4.816782704287285e-05, "loss": 0.5303, "step": 404 }, { "epoch": 0.40059347181008903, "grad_norm": 0.44166637125698777, "learning_rate": 4.814950531330158e-05, "loss": 0.5474, "step": 405 }, { "epoch": 0.40158259149357073, "grad_norm": 0.48008127596420697, "learning_rate": 4.813118358373031e-05, "loss": 0.6364, "step": 406 }, { "epoch": 0.4025717111770524, "grad_norm": 0.46798165410172304, "learning_rate": 4.8112861854159033e-05, "loss": 0.6383, "step": 407 }, { "epoch": 0.4035608308605341, "grad_norm": 0.44125266116511586, "learning_rate": 4.809454012458776e-05, "loss": 0.5955, "step": 408 }, { "epoch": 0.4045499505440158, "grad_norm": 0.4395464902377213, "learning_rate": 4.807621839501649e-05, "loss": 0.7118, "step": 409 }, { "epoch": 0.4055390702274975, "grad_norm": 0.3816754087372529, "learning_rate": 4.805789666544522e-05, "loss": 0.5148, "step": 410 }, { "epoch": 0.4065281899109792, "grad_norm": 0.41359029720451895, "learning_rate": 4.8039574935873946e-05, "loss": 0.588, "step": 411 }, { "epoch": 0.4075173095944609, "grad_norm": 0.4693298663552246, "learning_rate": 4.802125320630268e-05, "loss": 0.6457, "step": 412 }, { "epoch": 0.4085064292779426, "grad_norm": 0.4067116282641899, "learning_rate": 4.8002931476731406e-05, "loss": 0.5495, "step": 413 }, { "epoch": 0.4094955489614243, "grad_norm": 0.4272277873309838, "learning_rate": 4.798460974716014e-05, "loss": 0.6337, "step": 414 }, { "epoch": 0.410484668644906, "grad_norm": 0.4811569091836984, "learning_rate": 4.7966288017588865e-05, "loss": 0.6149, "step": 415 }, { "epoch": 0.4114737883283877, "grad_norm": 0.38122687491339136, "learning_rate": 4.794796628801759e-05, "loss": 0.5846, "step": 416 }, { "epoch": 0.4124629080118694, "grad_norm": 0.4862215332358342, "learning_rate": 4.792964455844632e-05, "loss": 0.6186, "step": 417 }, { "epoch": 0.4134520276953511, "grad_norm": 0.47460277960742403, "learning_rate": 4.791132282887505e-05, "loss": 0.5941, "step": 418 }, { "epoch": 0.41444114737883286, "grad_norm": 0.40971075817407676, "learning_rate": 4.789300109930378e-05, "loss": 0.5409, "step": 419 }, { "epoch": 0.41543026706231456, "grad_norm": 0.40503923340944115, "learning_rate": 4.7874679369732504e-05, "loss": 0.5656, "step": 420 }, { "epoch": 0.41641938674579626, "grad_norm": 0.47166101201631216, "learning_rate": 4.785635764016123e-05, "loss": 0.5734, "step": 421 }, { "epoch": 0.41740850642927796, "grad_norm": 0.4074965319088034, "learning_rate": 4.7838035910589964e-05, "loss": 0.552, "step": 422 }, { "epoch": 0.41839762611275966, "grad_norm": 0.39208549972043516, "learning_rate": 4.781971418101869e-05, "loss": 0.6738, "step": 423 }, { "epoch": 0.41938674579624136, "grad_norm": 0.6011637351858017, "learning_rate": 4.7801392451447416e-05, "loss": 0.5603, "step": 424 }, { "epoch": 0.42037586547972305, "grad_norm": 0.3726260226014429, "learning_rate": 4.778307072187615e-05, "loss": 0.5816, "step": 425 }, { "epoch": 0.42136498516320475, "grad_norm": 0.6236845405373085, "learning_rate": 4.7764748992304876e-05, "loss": 0.6482, "step": 426 }, { "epoch": 0.42235410484668645, "grad_norm": 3.2608951858084922, "learning_rate": 4.774642726273361e-05, "loss": 0.6115, "step": 427 }, { "epoch": 0.42334322453016815, "grad_norm": 0.5292345408652966, "learning_rate": 4.772810553316233e-05, "loss": 0.5683, "step": 428 }, { "epoch": 0.42433234421364985, "grad_norm": 0.7800091939799507, "learning_rate": 4.770978380359106e-05, "loss": 0.5404, "step": 429 }, { "epoch": 0.42532146389713155, "grad_norm": 0.3947675558178534, "learning_rate": 4.769146207401979e-05, "loss": 0.5609, "step": 430 }, { "epoch": 0.42631058358061324, "grad_norm": 8.27438656210951, "learning_rate": 4.767314034444852e-05, "loss": 0.8391, "step": 431 }, { "epoch": 0.42729970326409494, "grad_norm": 0.8743079722440797, "learning_rate": 4.765481861487724e-05, "loss": 0.5703, "step": 432 }, { "epoch": 0.42828882294757664, "grad_norm": 1.903800162095679, "learning_rate": 4.7636496885305975e-05, "loss": 0.5864, "step": 433 }, { "epoch": 0.42927794263105834, "grad_norm": 0.6629417966412571, "learning_rate": 4.76181751557347e-05, "loss": 0.5367, "step": 434 }, { "epoch": 0.43026706231454004, "grad_norm": 0.7219662282705351, "learning_rate": 4.7599853426163434e-05, "loss": 0.6087, "step": 435 }, { "epoch": 0.43125618199802174, "grad_norm": 0.4530515462755643, "learning_rate": 4.758153169659216e-05, "loss": 0.5095, "step": 436 }, { "epoch": 0.43224530168150344, "grad_norm": 0.7726116565074944, "learning_rate": 4.756320996702089e-05, "loss": 0.6424, "step": 437 }, { "epoch": 0.4332344213649852, "grad_norm": 0.5383329629673866, "learning_rate": 4.754488823744962e-05, "loss": 0.6031, "step": 438 }, { "epoch": 0.4342235410484669, "grad_norm": 0.6766939106280287, "learning_rate": 4.7526566507878347e-05, "loss": 0.6031, "step": 439 }, { "epoch": 0.4352126607319486, "grad_norm": 0.6940726364507884, "learning_rate": 4.750824477830707e-05, "loss": 0.5308, "step": 440 }, { "epoch": 0.4362017804154303, "grad_norm": 0.5386095591035055, "learning_rate": 4.74899230487358e-05, "loss": 0.5361, "step": 441 }, { "epoch": 0.437190900098912, "grad_norm": 0.6810339477078869, "learning_rate": 4.747160131916453e-05, "loss": 0.5189, "step": 442 }, { "epoch": 0.4381800197823937, "grad_norm": 0.4935297438689495, "learning_rate": 4.745327958959326e-05, "loss": 0.5648, "step": 443 }, { "epoch": 0.4391691394658754, "grad_norm": 0.5426668934382847, "learning_rate": 4.743495786002199e-05, "loss": 0.5745, "step": 444 }, { "epoch": 0.4401582591493571, "grad_norm": 0.5200388242525367, "learning_rate": 4.741663613045071e-05, "loss": 0.5863, "step": 445 }, { "epoch": 0.4411473788328388, "grad_norm": 1.3830554463390863, "learning_rate": 4.7398314400879445e-05, "loss": 0.5881, "step": 446 }, { "epoch": 0.4421364985163205, "grad_norm": 0.6948346194161272, "learning_rate": 4.737999267130817e-05, "loss": 0.5239, "step": 447 }, { "epoch": 0.4431256181998022, "grad_norm": 0.6601938762365581, "learning_rate": 4.7361670941736905e-05, "loss": 0.5487, "step": 448 }, { "epoch": 0.44411473788328387, "grad_norm": 0.5617566077718212, "learning_rate": 4.734334921216563e-05, "loss": 0.6214, "step": 449 }, { "epoch": 0.44510385756676557, "grad_norm": 0.732252290442145, "learning_rate": 4.732502748259436e-05, "loss": 0.5628, "step": 450 }, { "epoch": 0.44609297725024727, "grad_norm": 0.44306508264201677, "learning_rate": 4.730670575302309e-05, "loss": 0.6078, "step": 451 }, { "epoch": 0.44708209693372897, "grad_norm": 0.5296732642576735, "learning_rate": 4.728838402345182e-05, "loss": 0.5147, "step": 452 }, { "epoch": 0.44807121661721067, "grad_norm": 1.652456677052837, "learning_rate": 4.7270062293880544e-05, "loss": 0.5598, "step": 453 }, { "epoch": 0.44906033630069236, "grad_norm": 0.6144592738271647, "learning_rate": 4.725174056430927e-05, "loss": 0.51, "step": 454 }, { "epoch": 0.45004945598417406, "grad_norm": 0.4613783043555175, "learning_rate": 4.7233418834738e-05, "loss": 0.6202, "step": 455 }, { "epoch": 0.45103857566765576, "grad_norm": 0.5213448885231209, "learning_rate": 4.721509710516673e-05, "loss": 0.5308, "step": 456 }, { "epoch": 0.4520276953511375, "grad_norm": 0.46820252742016644, "learning_rate": 4.7196775375595456e-05, "loss": 0.5832, "step": 457 }, { "epoch": 0.4530168150346192, "grad_norm": 0.42474287967898067, "learning_rate": 4.717845364602418e-05, "loss": 0.5684, "step": 458 }, { "epoch": 0.4540059347181009, "grad_norm": 0.39392310575419126, "learning_rate": 4.7160131916452916e-05, "loss": 0.624, "step": 459 }, { "epoch": 0.4549950544015826, "grad_norm": 0.4175763227925975, "learning_rate": 4.714181018688165e-05, "loss": 0.6065, "step": 460 }, { "epoch": 0.4559841740850643, "grad_norm": 0.3800322746234243, "learning_rate": 4.7123488457310375e-05, "loss": 0.6242, "step": 461 }, { "epoch": 0.456973293768546, "grad_norm": 0.37365387127732047, "learning_rate": 4.71051667277391e-05, "loss": 0.6967, "step": 462 }, { "epoch": 0.4579624134520277, "grad_norm": 0.46031496253337684, "learning_rate": 4.708684499816783e-05, "loss": 0.5547, "step": 463 }, { "epoch": 0.4589515331355094, "grad_norm": 0.39156501494001045, "learning_rate": 4.706852326859656e-05, "loss": 0.5693, "step": 464 }, { "epoch": 0.4599406528189911, "grad_norm": 0.37353175638665764, "learning_rate": 4.705020153902529e-05, "loss": 0.587, "step": 465 }, { "epoch": 0.4609297725024728, "grad_norm": 0.43060709388709534, "learning_rate": 4.7031879809454014e-05, "loss": 0.5674, "step": 466 }, { "epoch": 0.4619188921859545, "grad_norm": 0.38735425009204794, "learning_rate": 4.701355807988274e-05, "loss": 0.6058, "step": 467 }, { "epoch": 0.4629080118694362, "grad_norm": 0.46266944891118184, "learning_rate": 4.6995236350311474e-05, "loss": 0.5652, "step": 468 }, { "epoch": 0.4638971315529179, "grad_norm": 0.36247469180107994, "learning_rate": 4.69769146207402e-05, "loss": 0.6043, "step": 469 }, { "epoch": 0.4648862512363996, "grad_norm": 0.4030557538436561, "learning_rate": 4.6958592891168927e-05, "loss": 0.6128, "step": 470 }, { "epoch": 0.4658753709198813, "grad_norm": 0.6033825885688812, "learning_rate": 4.694027116159766e-05, "loss": 0.5646, "step": 471 }, { "epoch": 0.466864490603363, "grad_norm": 0.37720757452451065, "learning_rate": 4.6921949432026386e-05, "loss": 0.5317, "step": 472 }, { "epoch": 0.4678536102868447, "grad_norm": 0.5333495762992678, "learning_rate": 4.690362770245512e-05, "loss": 0.6658, "step": 473 }, { "epoch": 0.4688427299703264, "grad_norm": 0.5133974898384014, "learning_rate": 4.688530597288384e-05, "loss": 0.5035, "step": 474 }, { "epoch": 0.4698318496538081, "grad_norm": 0.4539945208932187, "learning_rate": 4.686698424331257e-05, "loss": 0.5747, "step": 475 }, { "epoch": 0.4708209693372898, "grad_norm": 0.41564631661574264, "learning_rate": 4.68486625137413e-05, "loss": 0.4843, "step": 476 }, { "epoch": 0.47181008902077154, "grad_norm": 0.38138121789633134, "learning_rate": 4.683034078417003e-05, "loss": 0.5409, "step": 477 }, { "epoch": 0.47279920870425324, "grad_norm": 0.44871468038998025, "learning_rate": 4.681201905459875e-05, "loss": 0.5109, "step": 478 }, { "epoch": 0.47378832838773494, "grad_norm": 0.3177694456049873, "learning_rate": 4.6793697325027485e-05, "loss": 0.602, "step": 479 }, { "epoch": 0.47477744807121663, "grad_norm": 0.46257643128201287, "learning_rate": 4.677537559545621e-05, "loss": 0.6047, "step": 480 }, { "epoch": 0.47576656775469833, "grad_norm": 4.67269769452667, "learning_rate": 4.6757053865884944e-05, "loss": 0.8503, "step": 481 }, { "epoch": 0.47675568743818003, "grad_norm": 0.39836333155328146, "learning_rate": 4.673873213631367e-05, "loss": 0.6113, "step": 482 }, { "epoch": 0.47774480712166173, "grad_norm": 2.353788660145488, "learning_rate": 4.67204104067424e-05, "loss": 0.7275, "step": 483 }, { "epoch": 0.47873392680514343, "grad_norm": 0.601097344141485, "learning_rate": 4.670208867717113e-05, "loss": 0.6335, "step": 484 }, { "epoch": 0.4797230464886251, "grad_norm": 0.36883129234828055, "learning_rate": 4.668376694759986e-05, "loss": 0.6315, "step": 485 }, { "epoch": 0.4807121661721068, "grad_norm": 1.8909305038276623, "learning_rate": 4.666544521802858e-05, "loss": 0.7504, "step": 486 }, { "epoch": 0.4817012858555885, "grad_norm": 0.4591587176811992, "learning_rate": 4.664712348845731e-05, "loss": 0.5666, "step": 487 }, { "epoch": 0.4826904055390702, "grad_norm": 0.4144642652670824, "learning_rate": 4.662880175888604e-05, "loss": 0.561, "step": 488 }, { "epoch": 0.4836795252225519, "grad_norm": 5.896448976676767, "learning_rate": 4.661048002931477e-05, "loss": 0.9015, "step": 489 }, { "epoch": 0.4846686449060336, "grad_norm": 0.5720698016034343, "learning_rate": 4.65921582997435e-05, "loss": 0.6114, "step": 490 }, { "epoch": 0.4856577645895153, "grad_norm": 0.3679119804465383, "learning_rate": 4.657383657017222e-05, "loss": 0.5466, "step": 491 }, { "epoch": 0.486646884272997, "grad_norm": 0.44989689849502357, "learning_rate": 4.6555514840600955e-05, "loss": 0.5291, "step": 492 }, { "epoch": 0.4876360039564787, "grad_norm": 0.5024872015252905, "learning_rate": 4.653719311102968e-05, "loss": 0.6009, "step": 493 }, { "epoch": 0.4886251236399604, "grad_norm": 0.464562771119027, "learning_rate": 4.6518871381458415e-05, "loss": 0.5875, "step": 494 }, { "epoch": 0.4896142433234421, "grad_norm": 0.48167751471277875, "learning_rate": 4.650054965188714e-05, "loss": 0.4983, "step": 495 }, { "epoch": 0.49060336300692386, "grad_norm": 1.9507530661852923, "learning_rate": 4.648222792231587e-05, "loss": 0.6099, "step": 496 }, { "epoch": 0.49159248269040556, "grad_norm": 0.606821397983761, "learning_rate": 4.64639061927446e-05, "loss": 0.5899, "step": 497 }, { "epoch": 0.49258160237388726, "grad_norm": 0.533867569738737, "learning_rate": 4.644558446317333e-05, "loss": 0.6233, "step": 498 }, { "epoch": 0.49357072205736896, "grad_norm": 0.529229125116131, "learning_rate": 4.6427262733602054e-05, "loss": 0.5453, "step": 499 }, { "epoch": 0.49455984174085066, "grad_norm": 0.4916159848073325, "learning_rate": 4.640894100403078e-05, "loss": 0.5724, "step": 500 }, { "epoch": 0.49554896142433236, "grad_norm": 0.4818563677803172, "learning_rate": 4.639061927445951e-05, "loss": 0.5693, "step": 501 }, { "epoch": 0.49653808110781406, "grad_norm": 4.7169140468910316, "learning_rate": 4.637229754488824e-05, "loss": 0.6424, "step": 502 }, { "epoch": 0.49752720079129575, "grad_norm": 0.6812953196471956, "learning_rate": 4.6353975815316966e-05, "loss": 0.5695, "step": 503 }, { "epoch": 0.49851632047477745, "grad_norm": 0.4430846469861221, "learning_rate": 4.633565408574569e-05, "loss": 0.5638, "step": 504 }, { "epoch": 0.49950544015825915, "grad_norm": 0.5867096506625531, "learning_rate": 4.6317332356174426e-05, "loss": 0.577, "step": 505 }, { "epoch": 0.5004945598417408, "grad_norm": 0.9442304457268548, "learning_rate": 4.629901062660315e-05, "loss": 0.5029, "step": 506 }, { "epoch": 0.5014836795252225, "grad_norm": 0.5146057080865236, "learning_rate": 4.6280688897031885e-05, "loss": 0.5866, "step": 507 }, { "epoch": 0.5024727992087042, "grad_norm": 0.40718135381499504, "learning_rate": 4.626236716746061e-05, "loss": 0.5325, "step": 508 }, { "epoch": 0.503461918892186, "grad_norm": 0.5519103208111765, "learning_rate": 4.624404543788934e-05, "loss": 0.6358, "step": 509 }, { "epoch": 0.5044510385756676, "grad_norm": 0.43699948345174316, "learning_rate": 4.622572370831807e-05, "loss": 0.608, "step": 510 }, { "epoch": 0.5054401582591493, "grad_norm": 0.521982987910703, "learning_rate": 4.62074019787468e-05, "loss": 0.6314, "step": 511 }, { "epoch": 0.506429277942631, "grad_norm": 0.3869957567924524, "learning_rate": 4.6189080249175524e-05, "loss": 0.4741, "step": 512 }, { "epoch": 0.5074183976261127, "grad_norm": 0.4939706787894611, "learning_rate": 4.617075851960425e-05, "loss": 0.562, "step": 513 }, { "epoch": 0.5084075173095944, "grad_norm": 0.4173292639174069, "learning_rate": 4.6152436790032984e-05, "loss": 0.5002, "step": 514 }, { "epoch": 0.5093966369930761, "grad_norm": 0.4096494198363875, "learning_rate": 4.613411506046171e-05, "loss": 0.4885, "step": 515 }, { "epoch": 0.5103857566765578, "grad_norm": 9.858981341609425, "learning_rate": 4.6115793330890437e-05, "loss": 0.685, "step": 516 }, { "epoch": 0.5113748763600395, "grad_norm": 0.5195002022290981, "learning_rate": 4.609747160131916e-05, "loss": 0.5369, "step": 517 }, { "epoch": 0.5123639960435212, "grad_norm": 0.3934474506050803, "learning_rate": 4.6079149871747896e-05, "loss": 0.5853, "step": 518 }, { "epoch": 0.5133531157270029, "grad_norm": 0.4134221147318054, "learning_rate": 4.606082814217663e-05, "loss": 0.5617, "step": 519 }, { "epoch": 0.5143422354104846, "grad_norm": 0.42829816283403555, "learning_rate": 4.604250641260535e-05, "loss": 0.5531, "step": 520 }, { "epoch": 0.5153313550939663, "grad_norm": 0.48752740238717335, "learning_rate": 4.602418468303408e-05, "loss": 0.606, "step": 521 }, { "epoch": 0.516320474777448, "grad_norm": 0.38538760178318904, "learning_rate": 4.600586295346281e-05, "loss": 0.5335, "step": 522 }, { "epoch": 0.5173095944609297, "grad_norm": 0.3936884719789974, "learning_rate": 4.598754122389154e-05, "loss": 0.5447, "step": 523 }, { "epoch": 0.5182987141444114, "grad_norm": 0.4059453713559687, "learning_rate": 4.596921949432026e-05, "loss": 0.4842, "step": 524 }, { "epoch": 0.5192878338278932, "grad_norm": 1.5542942915947051, "learning_rate": 4.5950897764748995e-05, "loss": 0.6171, "step": 525 }, { "epoch": 0.5202769535113749, "grad_norm": 1.5720532397067744, "learning_rate": 4.593257603517772e-05, "loss": 0.5877, "step": 526 }, { "epoch": 0.5212660731948566, "grad_norm": 2.9326347862637765, "learning_rate": 4.5914254305606454e-05, "loss": 0.6602, "step": 527 }, { "epoch": 0.5222551928783383, "grad_norm": 0.40876849501610224, "learning_rate": 4.589593257603518e-05, "loss": 0.4963, "step": 528 }, { "epoch": 0.52324431256182, "grad_norm": 0.43764445066833524, "learning_rate": 4.587761084646391e-05, "loss": 0.5417, "step": 529 }, { "epoch": 0.5242334322453017, "grad_norm": 0.40565909079834533, "learning_rate": 4.5859289116892634e-05, "loss": 0.576, "step": 530 }, { "epoch": 0.5252225519287834, "grad_norm": 0.4217327834248698, "learning_rate": 4.584096738732137e-05, "loss": 0.6196, "step": 531 }, { "epoch": 0.5262116716122651, "grad_norm": 0.46160735689946497, "learning_rate": 4.582264565775009e-05, "loss": 0.5486, "step": 532 }, { "epoch": 0.5272007912957468, "grad_norm": 0.4032237658743589, "learning_rate": 4.580432392817882e-05, "loss": 0.5626, "step": 533 }, { "epoch": 0.5281899109792285, "grad_norm": 0.5635928214859514, "learning_rate": 4.578600219860755e-05, "loss": 0.6173, "step": 534 }, { "epoch": 0.5291790306627102, "grad_norm": 0.6342796371814479, "learning_rate": 4.576768046903628e-05, "loss": 0.6277, "step": 535 }, { "epoch": 0.5301681503461919, "grad_norm": 0.4677423139160351, "learning_rate": 4.574935873946501e-05, "loss": 0.5694, "step": 536 }, { "epoch": 0.5311572700296736, "grad_norm": 0.40238302891813643, "learning_rate": 4.573103700989373e-05, "loss": 0.6217, "step": 537 }, { "epoch": 0.5321463897131553, "grad_norm": 0.504563589844548, "learning_rate": 4.5712715280322465e-05, "loss": 0.6222, "step": 538 }, { "epoch": 0.533135509396637, "grad_norm": 0.42246468004279764, "learning_rate": 4.569439355075119e-05, "loss": 0.5684, "step": 539 }, { "epoch": 0.5341246290801187, "grad_norm": 0.40403558987941973, "learning_rate": 4.5676071821179925e-05, "loss": 0.558, "step": 540 }, { "epoch": 0.5351137487636004, "grad_norm": 0.44549737299183073, "learning_rate": 4.5657750091608644e-05, "loss": 0.5609, "step": 541 }, { "epoch": 0.5361028684470821, "grad_norm": 20.81218570435005, "learning_rate": 4.563942836203738e-05, "loss": 0.9656, "step": 542 }, { "epoch": 0.5370919881305638, "grad_norm": 0.4727615722395577, "learning_rate": 4.562110663246611e-05, "loss": 0.4987, "step": 543 }, { "epoch": 0.5380811078140455, "grad_norm": 0.3679816340175357, "learning_rate": 4.560278490289484e-05, "loss": 0.5986, "step": 544 }, { "epoch": 0.5390702274975272, "grad_norm": 0.4198111058431937, "learning_rate": 4.5584463173323564e-05, "loss": 0.6406, "step": 545 }, { "epoch": 0.5400593471810089, "grad_norm": 0.381705361347438, "learning_rate": 4.556614144375229e-05, "loss": 0.5868, "step": 546 }, { "epoch": 0.5410484668644906, "grad_norm": 0.36128173218405524, "learning_rate": 4.554781971418102e-05, "loss": 0.5606, "step": 547 }, { "epoch": 0.5420375865479723, "grad_norm": 0.36624104089112686, "learning_rate": 4.552949798460975e-05, "loss": 0.5441, "step": 548 }, { "epoch": 0.543026706231454, "grad_norm": 2.643110350633743, "learning_rate": 4.5511176255038476e-05, "loss": 0.6998, "step": 549 }, { "epoch": 0.5440158259149357, "grad_norm": 0.45802173595768614, "learning_rate": 4.54928545254672e-05, "loss": 0.6603, "step": 550 }, { "epoch": 0.5450049455984174, "grad_norm": 0.39506056655492705, "learning_rate": 4.5474532795895936e-05, "loss": 0.5296, "step": 551 }, { "epoch": 0.5459940652818991, "grad_norm": 0.45358696386112984, "learning_rate": 4.545621106632466e-05, "loss": 0.5913, "step": 552 }, { "epoch": 0.5469831849653808, "grad_norm": 0.8759613406131889, "learning_rate": 4.5437889336753395e-05, "loss": 0.5732, "step": 553 }, { "epoch": 0.5479723046488625, "grad_norm": 0.5560376705190068, "learning_rate": 4.5419567607182115e-05, "loss": 0.5513, "step": 554 }, { "epoch": 0.5489614243323442, "grad_norm": 0.46334900525439865, "learning_rate": 4.540124587761085e-05, "loss": 0.5847, "step": 555 }, { "epoch": 0.5499505440158259, "grad_norm": 0.5044951628643652, "learning_rate": 4.538292414803958e-05, "loss": 0.6368, "step": 556 }, { "epoch": 0.5509396636993076, "grad_norm": 0.44658793634543775, "learning_rate": 4.536460241846831e-05, "loss": 0.5482, "step": 557 }, { "epoch": 0.5519287833827893, "grad_norm": 0.4668038982374279, "learning_rate": 4.5346280688897034e-05, "loss": 0.6158, "step": 558 }, { "epoch": 0.552917903066271, "grad_norm": 0.48096355582962474, "learning_rate": 4.532795895932576e-05, "loss": 0.57, "step": 559 }, { "epoch": 0.5539070227497527, "grad_norm": 0.4897334267462889, "learning_rate": 4.5309637229754494e-05, "loss": 0.552, "step": 560 }, { "epoch": 0.5548961424332344, "grad_norm": 0.4185534474944087, "learning_rate": 4.529131550018322e-05, "loss": 0.5191, "step": 561 }, { "epoch": 0.5558852621167161, "grad_norm": 0.5365259911263918, "learning_rate": 4.527299377061195e-05, "loss": 0.5084, "step": 562 }, { "epoch": 0.5568743818001978, "grad_norm": 0.4215213687101347, "learning_rate": 4.525467204104067e-05, "loss": 0.6025, "step": 563 }, { "epoch": 0.5578635014836796, "grad_norm": 0.4740697064883748, "learning_rate": 4.5236350311469406e-05, "loss": 0.4966, "step": 564 }, { "epoch": 0.5588526211671613, "grad_norm": 0.44324661986533054, "learning_rate": 4.521802858189813e-05, "loss": 0.566, "step": 565 }, { "epoch": 0.559841740850643, "grad_norm": 2.2285546984735856, "learning_rate": 4.519970685232686e-05, "loss": 0.7091, "step": 566 }, { "epoch": 0.5608308605341247, "grad_norm": 0.5556074562961142, "learning_rate": 4.518138512275559e-05, "loss": 0.594, "step": 567 }, { "epoch": 0.5618199802176064, "grad_norm": 0.4500464488699885, "learning_rate": 4.516306339318432e-05, "loss": 0.5656, "step": 568 }, { "epoch": 0.5628090999010881, "grad_norm": 0.3512247907513221, "learning_rate": 4.514474166361305e-05, "loss": 0.5336, "step": 569 }, { "epoch": 0.5637982195845698, "grad_norm": 0.4304687263152149, "learning_rate": 4.512641993404177e-05, "loss": 0.5865, "step": 570 }, { "epoch": 0.5647873392680515, "grad_norm": 0.4521665914106063, "learning_rate": 4.5108098204470505e-05, "loss": 0.5873, "step": 571 }, { "epoch": 0.5657764589515332, "grad_norm": 0.4159456344130283, "learning_rate": 4.508977647489923e-05, "loss": 0.5975, "step": 572 }, { "epoch": 0.5667655786350149, "grad_norm": 0.5296905855219289, "learning_rate": 4.5071454745327964e-05, "loss": 0.5955, "step": 573 }, { "epoch": 0.5677546983184966, "grad_norm": 0.4014808388631957, "learning_rate": 4.505313301575669e-05, "loss": 0.5433, "step": 574 }, { "epoch": 0.5687438180019783, "grad_norm": 0.4319671517552426, "learning_rate": 4.503481128618542e-05, "loss": 0.4904, "step": 575 }, { "epoch": 0.56973293768546, "grad_norm": 3.6175278656140404, "learning_rate": 4.5016489556614144e-05, "loss": 0.6217, "step": 576 }, { "epoch": 0.5707220573689417, "grad_norm": 0.42678930653486047, "learning_rate": 4.499816782704288e-05, "loss": 0.5212, "step": 577 }, { "epoch": 0.5717111770524234, "grad_norm": 0.4066859848927131, "learning_rate": 4.49798460974716e-05, "loss": 0.542, "step": 578 }, { "epoch": 0.5727002967359051, "grad_norm": 0.4150317023868254, "learning_rate": 4.496152436790033e-05, "loss": 0.5509, "step": 579 }, { "epoch": 0.5736894164193868, "grad_norm": 0.39696777501880237, "learning_rate": 4.494320263832906e-05, "loss": 0.6286, "step": 580 }, { "epoch": 0.5746785361028685, "grad_norm": 0.42488826730568463, "learning_rate": 4.492488090875779e-05, "loss": 0.5245, "step": 581 }, { "epoch": 0.5756676557863502, "grad_norm": 0.3832259003808749, "learning_rate": 4.490655917918652e-05, "loss": 0.513, "step": 582 }, { "epoch": 0.5766567754698319, "grad_norm": 0.37765307913240526, "learning_rate": 4.488823744961524e-05, "loss": 0.5797, "step": 583 }, { "epoch": 0.5776458951533135, "grad_norm": 0.5243524846267854, "learning_rate": 4.4869915720043975e-05, "loss": 0.5494, "step": 584 }, { "epoch": 0.5786350148367952, "grad_norm": 0.35608241349020836, "learning_rate": 4.48515939904727e-05, "loss": 0.5169, "step": 585 }, { "epoch": 0.579624134520277, "grad_norm": 0.5072400729075438, "learning_rate": 4.4833272260901435e-05, "loss": 0.5375, "step": 586 }, { "epoch": 0.5806132542037586, "grad_norm": 0.3887888599949883, "learning_rate": 4.4814950531330155e-05, "loss": 0.5928, "step": 587 }, { "epoch": 0.5816023738872403, "grad_norm": 0.38637277802524667, "learning_rate": 4.479662880175889e-05, "loss": 0.5631, "step": 588 }, { "epoch": 0.582591493570722, "grad_norm": 0.4544815503207846, "learning_rate": 4.4778307072187614e-05, "loss": 0.5284, "step": 589 }, { "epoch": 0.5835806132542037, "grad_norm": 0.44860638139409253, "learning_rate": 4.475998534261635e-05, "loss": 0.5568, "step": 590 }, { "epoch": 0.5845697329376854, "grad_norm": 0.34783304628715694, "learning_rate": 4.4741663613045074e-05, "loss": 0.4736, "step": 591 }, { "epoch": 0.5855588526211671, "grad_norm": 0.46635755164532483, "learning_rate": 4.47233418834738e-05, "loss": 0.5721, "step": 592 }, { "epoch": 0.5865479723046488, "grad_norm": 0.35836770919240934, "learning_rate": 4.4705020153902533e-05, "loss": 0.5742, "step": 593 }, { "epoch": 0.5875370919881305, "grad_norm": 2.1646391480973386, "learning_rate": 4.468669842433126e-05, "loss": 0.7749, "step": 594 }, { "epoch": 0.5885262116716122, "grad_norm": 0.6322154073368694, "learning_rate": 4.4668376694759986e-05, "loss": 0.5222, "step": 595 }, { "epoch": 0.5895153313550939, "grad_norm": 0.3447318173329335, "learning_rate": 4.465005496518871e-05, "loss": 0.5024, "step": 596 }, { "epoch": 0.5905044510385756, "grad_norm": 0.5950854440009333, "learning_rate": 4.4631733235617446e-05, "loss": 0.6533, "step": 597 }, { "epoch": 0.5914935707220573, "grad_norm": 0.521884110600529, "learning_rate": 4.461341150604617e-05, "loss": 0.5968, "step": 598 }, { "epoch": 0.592482690405539, "grad_norm": 0.3392872051376154, "learning_rate": 4.45950897764749e-05, "loss": 0.5776, "step": 599 }, { "epoch": 0.5934718100890207, "grad_norm": 0.5377239708646002, "learning_rate": 4.4576768046903625e-05, "loss": 0.6145, "step": 600 }, { "epoch": 0.5944609297725024, "grad_norm": 0.4318047502673512, "learning_rate": 4.455844631733236e-05, "loss": 0.6059, "step": 601 }, { "epoch": 0.5954500494559841, "grad_norm": 2.3631173625344735, "learning_rate": 4.454012458776109e-05, "loss": 0.779, "step": 602 }, { "epoch": 0.5964391691394659, "grad_norm": 0.5336870304548778, "learning_rate": 4.452180285818982e-05, "loss": 0.5058, "step": 603 }, { "epoch": 0.5974282888229476, "grad_norm": 0.4034031794946735, "learning_rate": 4.4503481128618544e-05, "loss": 0.5717, "step": 604 }, { "epoch": 0.5984174085064293, "grad_norm": 0.37891278714404253, "learning_rate": 4.448515939904727e-05, "loss": 0.5299, "step": 605 }, { "epoch": 0.599406528189911, "grad_norm": 1.6328876825145693, "learning_rate": 4.4466837669476004e-05, "loss": 0.7685, "step": 606 }, { "epoch": 0.6003956478733927, "grad_norm": 0.4889387354780191, "learning_rate": 4.444851593990473e-05, "loss": 0.5767, "step": 607 }, { "epoch": 0.6013847675568744, "grad_norm": 0.39614179421575757, "learning_rate": 4.443019421033346e-05, "loss": 0.5488, "step": 608 }, { "epoch": 0.6023738872403561, "grad_norm": 0.4660736811721914, "learning_rate": 4.441187248076218e-05, "loss": 0.5036, "step": 609 }, { "epoch": 0.6033630069238378, "grad_norm": 0.4305830100351165, "learning_rate": 4.4393550751190916e-05, "loss": 0.5586, "step": 610 }, { "epoch": 0.6043521266073195, "grad_norm": 0.42857086912608233, "learning_rate": 4.437522902161964e-05, "loss": 0.5373, "step": 611 }, { "epoch": 0.6053412462908012, "grad_norm": 0.4259812924670851, "learning_rate": 4.435690729204837e-05, "loss": 0.6364, "step": 612 }, { "epoch": 0.6063303659742829, "grad_norm": 0.35384845137297105, "learning_rate": 4.4338585562477096e-05, "loss": 0.5686, "step": 613 }, { "epoch": 0.6073194856577646, "grad_norm": 0.43840991923734457, "learning_rate": 4.432026383290583e-05, "loss": 0.5664, "step": 614 }, { "epoch": 0.6083086053412463, "grad_norm": 0.4160175071465533, "learning_rate": 4.430194210333456e-05, "loss": 0.5562, "step": 615 }, { "epoch": 0.609297725024728, "grad_norm": 0.4076622693219068, "learning_rate": 4.428362037376328e-05, "loss": 0.5819, "step": 616 }, { "epoch": 0.6102868447082097, "grad_norm": 0.43491508340384, "learning_rate": 4.4265298644192015e-05, "loss": 0.6215, "step": 617 }, { "epoch": 0.6112759643916914, "grad_norm": 0.9665467383337245, "learning_rate": 4.424697691462074e-05, "loss": 0.6142, "step": 618 }, { "epoch": 0.6122650840751731, "grad_norm": 0.3867966415220155, "learning_rate": 4.4228655185049474e-05, "loss": 0.5207, "step": 619 }, { "epoch": 0.6132542037586548, "grad_norm": 0.3384128186488838, "learning_rate": 4.42103334554782e-05, "loss": 0.5391, "step": 620 }, { "epoch": 0.6142433234421365, "grad_norm": 0.37030112577139734, "learning_rate": 4.419201172590693e-05, "loss": 0.5357, "step": 621 }, { "epoch": 0.6152324431256182, "grad_norm": 0.33532983716976167, "learning_rate": 4.4173689996335654e-05, "loss": 0.5596, "step": 622 }, { "epoch": 0.6162215628090999, "grad_norm": 2.1912897019885156, "learning_rate": 4.415536826676439e-05, "loss": 0.663, "step": 623 }, { "epoch": 0.6172106824925816, "grad_norm": 0.4108780719007618, "learning_rate": 4.413704653719311e-05, "loss": 0.5381, "step": 624 }, { "epoch": 0.6181998021760633, "grad_norm": 0.345525682169045, "learning_rate": 4.411872480762184e-05, "loss": 0.6086, "step": 625 }, { "epoch": 0.619188921859545, "grad_norm": 0.32145894628431493, "learning_rate": 4.410040307805057e-05, "loss": 0.5079, "step": 626 }, { "epoch": 0.6201780415430267, "grad_norm": 0.3400074427817454, "learning_rate": 4.40820813484793e-05, "loss": 0.556, "step": 627 }, { "epoch": 0.6211671612265084, "grad_norm": 0.336784464756408, "learning_rate": 4.406375961890803e-05, "loss": 0.5383, "step": 628 }, { "epoch": 0.6221562809099901, "grad_norm": 0.36068226884339905, "learning_rate": 4.404543788933675e-05, "loss": 0.5838, "step": 629 }, { "epoch": 0.6231454005934718, "grad_norm": 0.35409438086231915, "learning_rate": 4.4027116159765485e-05, "loss": 0.4777, "step": 630 }, { "epoch": 0.6241345202769535, "grad_norm": 0.3726791740087236, "learning_rate": 4.400879443019421e-05, "loss": 0.6175, "step": 631 }, { "epoch": 0.6251236399604352, "grad_norm": 0.4023806669048874, "learning_rate": 4.3990472700622945e-05, "loss": 0.6292, "step": 632 }, { "epoch": 0.6261127596439169, "grad_norm": 0.34330582894277495, "learning_rate": 4.3972150971051665e-05, "loss": 0.4987, "step": 633 }, { "epoch": 0.6271018793273986, "grad_norm": 0.3305730900775558, "learning_rate": 4.39538292414804e-05, "loss": 0.5235, "step": 634 }, { "epoch": 0.6280909990108803, "grad_norm": 0.36092394055577376, "learning_rate": 4.3935507511909124e-05, "loss": 0.5585, "step": 635 }, { "epoch": 0.629080118694362, "grad_norm": 0.4093047909780486, "learning_rate": 4.391718578233786e-05, "loss": 0.5418, "step": 636 }, { "epoch": 0.6300692383778437, "grad_norm": 0.33645740331275986, "learning_rate": 4.3898864052766584e-05, "loss": 0.4971, "step": 637 }, { "epoch": 0.6310583580613254, "grad_norm": 0.3611452319329633, "learning_rate": 4.388054232319531e-05, "loss": 0.5567, "step": 638 }, { "epoch": 0.6320474777448071, "grad_norm": 0.4930946827145663, "learning_rate": 4.3862220593624043e-05, "loss": 0.5986, "step": 639 }, { "epoch": 0.6330365974282888, "grad_norm": 0.3231177239659706, "learning_rate": 4.384389886405277e-05, "loss": 0.5975, "step": 640 }, { "epoch": 0.6340257171117705, "grad_norm": 0.3827959704241396, "learning_rate": 4.3825577134481496e-05, "loss": 0.5177, "step": 641 }, { "epoch": 0.6350148367952523, "grad_norm": 0.41977721218388675, "learning_rate": 4.380725540491022e-05, "loss": 0.6479, "step": 642 }, { "epoch": 0.636003956478734, "grad_norm": 0.3662099933871658, "learning_rate": 4.3788933675338956e-05, "loss": 0.6163, "step": 643 }, { "epoch": 0.6369930761622157, "grad_norm": 0.3712913474912623, "learning_rate": 4.377061194576768e-05, "loss": 0.4442, "step": 644 }, { "epoch": 0.6379821958456974, "grad_norm": 0.5071826500684371, "learning_rate": 4.375229021619641e-05, "loss": 0.5606, "step": 645 }, { "epoch": 0.6389713155291791, "grad_norm": 0.35640208864347855, "learning_rate": 4.3733968486625135e-05, "loss": 0.505, "step": 646 }, { "epoch": 0.6399604352126608, "grad_norm": 0.4223414729847192, "learning_rate": 4.371564675705387e-05, "loss": 0.5597, "step": 647 }, { "epoch": 0.6409495548961425, "grad_norm": 0.524278275869879, "learning_rate": 4.3697325027482595e-05, "loss": 0.5428, "step": 648 }, { "epoch": 0.6419386745796242, "grad_norm": 0.34090519496656335, "learning_rate": 4.367900329791133e-05, "loss": 0.5068, "step": 649 }, { "epoch": 0.6429277942631059, "grad_norm": 0.4974539026917071, "learning_rate": 4.3660681568340054e-05, "loss": 0.5449, "step": 650 }, { "epoch": 0.6439169139465876, "grad_norm": 0.4263296924702846, "learning_rate": 4.364235983876878e-05, "loss": 0.5863, "step": 651 }, { "epoch": 0.6449060336300693, "grad_norm": 0.36550014682885573, "learning_rate": 4.3624038109197514e-05, "loss": 0.5139, "step": 652 }, { "epoch": 0.645895153313551, "grad_norm": 0.4420805717206251, "learning_rate": 4.360571637962624e-05, "loss": 0.4919, "step": 653 }, { "epoch": 0.6468842729970327, "grad_norm": 3.5175564338439544, "learning_rate": 4.358739465005497e-05, "loss": 0.6138, "step": 654 }, { "epoch": 0.6478733926805144, "grad_norm": 0.44353550687361537, "learning_rate": 4.356907292048369e-05, "loss": 0.5678, "step": 655 }, { "epoch": 0.6488625123639961, "grad_norm": 0.42907631403450497, "learning_rate": 4.3550751190912426e-05, "loss": 0.5126, "step": 656 }, { "epoch": 0.6498516320474778, "grad_norm": 0.5224325181644863, "learning_rate": 4.353242946134115e-05, "loss": 0.567, "step": 657 }, { "epoch": 0.6508407517309595, "grad_norm": 0.36393145929326026, "learning_rate": 4.351410773176988e-05, "loss": 0.5758, "step": 658 }, { "epoch": 0.6518298714144412, "grad_norm": 0.43831662678367533, "learning_rate": 4.3495786002198606e-05, "loss": 0.4662, "step": 659 }, { "epoch": 0.6528189910979229, "grad_norm": 0.44731721791336854, "learning_rate": 4.347746427262734e-05, "loss": 0.5162, "step": 660 }, { "epoch": 0.6538081107814046, "grad_norm": 0.3625805902395787, "learning_rate": 4.3459142543056065e-05, "loss": 0.5866, "step": 661 }, { "epoch": 0.6547972304648862, "grad_norm": 0.374836185068099, "learning_rate": 4.344082081348479e-05, "loss": 0.5622, "step": 662 }, { "epoch": 0.655786350148368, "grad_norm": 0.3636250559467318, "learning_rate": 4.3422499083913525e-05, "loss": 0.4704, "step": 663 }, { "epoch": 0.6567754698318496, "grad_norm": 0.368172907844344, "learning_rate": 4.340417735434225e-05, "loss": 0.4582, "step": 664 }, { "epoch": 0.6577645895153313, "grad_norm": 0.3781176871717141, "learning_rate": 4.3385855624770985e-05, "loss": 0.5608, "step": 665 }, { "epoch": 0.658753709198813, "grad_norm": 0.3472496247475107, "learning_rate": 4.336753389519971e-05, "loss": 0.5267, "step": 666 }, { "epoch": 0.6597428288822947, "grad_norm": 0.35710749282494153, "learning_rate": 4.334921216562844e-05, "loss": 0.5912, "step": 667 }, { "epoch": 0.6607319485657764, "grad_norm": 0.34177842883688003, "learning_rate": 4.3330890436057164e-05, "loss": 0.5239, "step": 668 }, { "epoch": 0.6617210682492581, "grad_norm": 0.45206446669886513, "learning_rate": 4.33125687064859e-05, "loss": 0.6048, "step": 669 }, { "epoch": 0.6627101879327398, "grad_norm": 0.43477515611045725, "learning_rate": 4.3294246976914623e-05, "loss": 0.5396, "step": 670 }, { "epoch": 0.6636993076162215, "grad_norm": 0.4495761387137212, "learning_rate": 4.327592524734335e-05, "loss": 0.5174, "step": 671 }, { "epoch": 0.6646884272997032, "grad_norm": 1.8122069786779942, "learning_rate": 4.3257603517772076e-05, "loss": 0.5415, "step": 672 }, { "epoch": 0.6656775469831849, "grad_norm": 0.54700387793728, "learning_rate": 4.323928178820081e-05, "loss": 0.5466, "step": 673 }, { "epoch": 0.6666666666666666, "grad_norm": 0.3497883391162035, "learning_rate": 4.322096005862954e-05, "loss": 0.5633, "step": 674 }, { "epoch": 0.6676557863501483, "grad_norm": 0.45126911871908887, "learning_rate": 4.320263832905826e-05, "loss": 0.5531, "step": 675 }, { "epoch": 0.66864490603363, "grad_norm": 0.3951817778962431, "learning_rate": 4.3184316599486995e-05, "loss": 0.5225, "step": 676 }, { "epoch": 0.6696340257171117, "grad_norm": 0.5240280173507978, "learning_rate": 4.316599486991572e-05, "loss": 0.5278, "step": 677 }, { "epoch": 0.6706231454005934, "grad_norm": 0.3375650680362749, "learning_rate": 4.3147673140344455e-05, "loss": 0.5464, "step": 678 }, { "epoch": 0.6716122650840751, "grad_norm": 0.41081775932232456, "learning_rate": 4.3129351410773175e-05, "loss": 0.5289, "step": 679 }, { "epoch": 0.6726013847675568, "grad_norm": 0.42912334923955686, "learning_rate": 4.311102968120191e-05, "loss": 0.5685, "step": 680 }, { "epoch": 0.6735905044510386, "grad_norm": 0.44709522287251285, "learning_rate": 4.3092707951630634e-05, "loss": 0.5861, "step": 681 }, { "epoch": 0.6745796241345203, "grad_norm": 8.932319607989578, "learning_rate": 4.307438622205937e-05, "loss": 0.5878, "step": 682 }, { "epoch": 0.675568743818002, "grad_norm": 0.6163038315017687, "learning_rate": 4.3056064492488094e-05, "loss": 0.552, "step": 683 }, { "epoch": 0.6765578635014837, "grad_norm": 0.4130993321464594, "learning_rate": 4.303774276291682e-05, "loss": 0.484, "step": 684 }, { "epoch": 0.6775469831849654, "grad_norm": 0.44149481672035984, "learning_rate": 4.301942103334555e-05, "loss": 0.487, "step": 685 }, { "epoch": 0.6785361028684471, "grad_norm": 0.44270474817714917, "learning_rate": 4.300109930377428e-05, "loss": 0.4829, "step": 686 }, { "epoch": 0.6795252225519288, "grad_norm": 1.778964998297047, "learning_rate": 4.2982777574203006e-05, "loss": 0.6008, "step": 687 }, { "epoch": 0.6805143422354105, "grad_norm": 0.5042046907833749, "learning_rate": 4.296445584463173e-05, "loss": 0.5508, "step": 688 }, { "epoch": 0.6815034619188922, "grad_norm": 1.2364847410894697, "learning_rate": 4.2946134115060466e-05, "loss": 0.4864, "step": 689 }, { "epoch": 0.6824925816023739, "grad_norm": 5.675707936422887, "learning_rate": 4.292781238548919e-05, "loss": 0.5267, "step": 690 }, { "epoch": 0.6834817012858556, "grad_norm": 1.0113014216945861, "learning_rate": 4.290949065591792e-05, "loss": 0.6003, "step": 691 }, { "epoch": 0.6844708209693373, "grad_norm": 0.7307635089929869, "learning_rate": 4.2891168926346645e-05, "loss": 0.5435, "step": 692 }, { "epoch": 0.685459940652819, "grad_norm": 1.005536324793941, "learning_rate": 4.287284719677538e-05, "loss": 0.5953, "step": 693 }, { "epoch": 0.6864490603363007, "grad_norm": 0.8795150549550624, "learning_rate": 4.2854525467204105e-05, "loss": 0.5776, "step": 694 }, { "epoch": 0.6874381800197824, "grad_norm": 0.9217585885833345, "learning_rate": 4.283620373763284e-05, "loss": 0.4713, "step": 695 }, { "epoch": 0.6884272997032641, "grad_norm": 0.5750818575992418, "learning_rate": 4.281788200806156e-05, "loss": 0.5646, "step": 696 }, { "epoch": 0.6894164193867458, "grad_norm": 1.1839342901013068, "learning_rate": 4.279956027849029e-05, "loss": 0.5912, "step": 697 }, { "epoch": 0.6904055390702275, "grad_norm": 0.40639325595326, "learning_rate": 4.2781238548919024e-05, "loss": 0.5001, "step": 698 }, { "epoch": 0.6913946587537092, "grad_norm": 0.9197610387321684, "learning_rate": 4.276291681934775e-05, "loss": 0.5174, "step": 699 }, { "epoch": 0.6923837784371909, "grad_norm": 2.2435633788044482, "learning_rate": 4.274459508977648e-05, "loss": 0.6464, "step": 700 }, { "epoch": 0.6933728981206726, "grad_norm": 2.2636363371000616, "learning_rate": 4.27262733602052e-05, "loss": 0.6616, "step": 701 }, { "epoch": 0.6943620178041543, "grad_norm": 1.0957925072649075, "learning_rate": 4.2707951630633937e-05, "loss": 0.5103, "step": 702 }, { "epoch": 0.695351137487636, "grad_norm": 0.44136767500918594, "learning_rate": 4.268962990106266e-05, "loss": 0.5773, "step": 703 }, { "epoch": 0.6963402571711177, "grad_norm": 1.2119093160665309, "learning_rate": 4.267130817149139e-05, "loss": 0.5676, "step": 704 }, { "epoch": 0.6973293768545994, "grad_norm": 0.9844741497724914, "learning_rate": 4.2652986441920116e-05, "loss": 0.5346, "step": 705 }, { "epoch": 0.6983184965380811, "grad_norm": 0.9825434500331776, "learning_rate": 4.263466471234885e-05, "loss": 0.579, "step": 706 }, { "epoch": 0.6993076162215628, "grad_norm": 1.0080629257558236, "learning_rate": 4.2616342982777575e-05, "loss": 0.5322, "step": 707 }, { "epoch": 0.7002967359050445, "grad_norm": 0.6306084604291936, "learning_rate": 4.25980212532063e-05, "loss": 0.6135, "step": 708 }, { "epoch": 0.7012858555885262, "grad_norm": 0.7935850704567368, "learning_rate": 4.257969952363503e-05, "loss": 0.4881, "step": 709 }, { "epoch": 0.7022749752720079, "grad_norm": 0.42031411509137817, "learning_rate": 4.256137779406376e-05, "loss": 0.5627, "step": 710 }, { "epoch": 0.7032640949554896, "grad_norm": 0.6431022906724274, "learning_rate": 4.2543056064492495e-05, "loss": 0.5521, "step": 711 }, { "epoch": 0.7042532146389713, "grad_norm": 0.4051950629961147, "learning_rate": 4.252473433492122e-05, "loss": 0.539, "step": 712 }, { "epoch": 0.705242334322453, "grad_norm": 0.5504001902905677, "learning_rate": 4.250641260534995e-05, "loss": 0.6014, "step": 713 }, { "epoch": 0.7062314540059347, "grad_norm": 0.42943927228102546, "learning_rate": 4.2488090875778674e-05, "loss": 0.5071, "step": 714 }, { "epoch": 0.7072205736894164, "grad_norm": 0.4564346346810455, "learning_rate": 4.246976914620741e-05, "loss": 0.5798, "step": 715 }, { "epoch": 0.7082096933728981, "grad_norm": 0.41971345322735765, "learning_rate": 4.2451447416636133e-05, "loss": 0.4746, "step": 716 }, { "epoch": 0.7091988130563798, "grad_norm": 0.4767124000008028, "learning_rate": 4.243312568706486e-05, "loss": 0.5221, "step": 717 }, { "epoch": 0.7101879327398615, "grad_norm": 0.473920208531501, "learning_rate": 4.2414803957493586e-05, "loss": 0.6092, "step": 718 }, { "epoch": 0.7111770524233432, "grad_norm": 0.4008182380340919, "learning_rate": 4.239648222792232e-05, "loss": 0.5633, "step": 719 }, { "epoch": 0.712166172106825, "grad_norm": 0.4572650059707555, "learning_rate": 4.2378160498351046e-05, "loss": 0.4485, "step": 720 }, { "epoch": 0.7131552917903067, "grad_norm": 0.3982981440820982, "learning_rate": 4.235983876877977e-05, "loss": 0.5882, "step": 721 }, { "epoch": 0.7141444114737884, "grad_norm": 0.4445315839865244, "learning_rate": 4.2341517039208506e-05, "loss": 0.499, "step": 722 }, { "epoch": 0.7151335311572701, "grad_norm": 0.45327248020817545, "learning_rate": 4.232319530963723e-05, "loss": 0.5828, "step": 723 }, { "epoch": 0.7161226508407518, "grad_norm": 0.45072469495117606, "learning_rate": 4.2304873580065965e-05, "loss": 0.5684, "step": 724 }, { "epoch": 0.7171117705242335, "grad_norm": 0.5030033920719464, "learning_rate": 4.2286551850494685e-05, "loss": 0.5462, "step": 725 }, { "epoch": 0.7181008902077152, "grad_norm": 0.41612647044276724, "learning_rate": 4.226823012092342e-05, "loss": 0.5931, "step": 726 }, { "epoch": 0.7190900098911969, "grad_norm": 0.48469080484439975, "learning_rate": 4.2249908391352144e-05, "loss": 0.5663, "step": 727 }, { "epoch": 0.7200791295746786, "grad_norm": 0.3288997249334829, "learning_rate": 4.223158666178088e-05, "loss": 0.5097, "step": 728 }, { "epoch": 0.7210682492581603, "grad_norm": 0.4306446848182276, "learning_rate": 4.2213264932209604e-05, "loss": 0.5869, "step": 729 }, { "epoch": 0.722057368941642, "grad_norm": 4.7698964565308675, "learning_rate": 4.219494320263833e-05, "loss": 0.6767, "step": 730 }, { "epoch": 0.7230464886251237, "grad_norm": 0.5127544163487205, "learning_rate": 4.217662147306706e-05, "loss": 0.6094, "step": 731 }, { "epoch": 0.7240356083086054, "grad_norm": 0.41334473621761253, "learning_rate": 4.215829974349579e-05, "loss": 0.5082, "step": 732 }, { "epoch": 0.7250247279920871, "grad_norm": 0.36240790003835044, "learning_rate": 4.2139978013924516e-05, "loss": 0.5948, "step": 733 }, { "epoch": 0.7260138476755688, "grad_norm": 0.36783320263349717, "learning_rate": 4.212165628435324e-05, "loss": 0.4404, "step": 734 }, { "epoch": 0.7270029673590505, "grad_norm": 0.42754353298521264, "learning_rate": 4.2103334554781976e-05, "loss": 0.5593, "step": 735 }, { "epoch": 0.7279920870425322, "grad_norm": 0.334115716001555, "learning_rate": 4.20850128252107e-05, "loss": 0.5471, "step": 736 }, { "epoch": 0.7289812067260139, "grad_norm": 0.3964403729349191, "learning_rate": 4.206669109563943e-05, "loss": 0.5169, "step": 737 }, { "epoch": 0.7299703264094956, "grad_norm": 0.3524084819698355, "learning_rate": 4.2048369366068155e-05, "loss": 0.5195, "step": 738 }, { "epoch": 0.7309594460929772, "grad_norm": 0.8826038613244607, "learning_rate": 4.203004763649689e-05, "loss": 0.4875, "step": 739 }, { "epoch": 0.731948565776459, "grad_norm": 0.39827011655841327, "learning_rate": 4.2011725906925615e-05, "loss": 0.5543, "step": 740 }, { "epoch": 0.7329376854599406, "grad_norm": 0.42818287781464126, "learning_rate": 4.199340417735435e-05, "loss": 0.5792, "step": 741 }, { "epoch": 0.7339268051434223, "grad_norm": 1.6645913416124833, "learning_rate": 4.197508244778307e-05, "loss": 0.5845, "step": 742 }, { "epoch": 0.734915924826904, "grad_norm": 0.41168909047511215, "learning_rate": 4.19567607182118e-05, "loss": 0.5943, "step": 743 }, { "epoch": 0.7359050445103857, "grad_norm": 0.48966326888171574, "learning_rate": 4.193843898864053e-05, "loss": 0.5834, "step": 744 }, { "epoch": 0.7368941641938674, "grad_norm": 0.7123014415902675, "learning_rate": 4.192011725906926e-05, "loss": 0.5631, "step": 745 }, { "epoch": 0.7378832838773491, "grad_norm": 0.44472269484910176, "learning_rate": 4.190179552949799e-05, "loss": 0.6197, "step": 746 }, { "epoch": 0.7388724035608308, "grad_norm": 0.5054408180848112, "learning_rate": 4.1883473799926713e-05, "loss": 0.5104, "step": 747 }, { "epoch": 0.7398615232443125, "grad_norm": 0.5692571400825903, "learning_rate": 4.1865152070355447e-05, "loss": 0.5738, "step": 748 }, { "epoch": 0.7408506429277942, "grad_norm": 0.45821840458928037, "learning_rate": 4.184683034078417e-05, "loss": 0.5037, "step": 749 }, { "epoch": 0.7418397626112759, "grad_norm": 1.64441512213632, "learning_rate": 4.18285086112129e-05, "loss": 0.5396, "step": 750 }, { "epoch": 0.7428288822947576, "grad_norm": 0.45495252031691974, "learning_rate": 4.1810186881641626e-05, "loss": 0.5177, "step": 751 }, { "epoch": 0.7438180019782393, "grad_norm": 0.49653489147704893, "learning_rate": 4.179186515207036e-05, "loss": 0.6201, "step": 752 }, { "epoch": 0.744807121661721, "grad_norm": 0.3524624469672887, "learning_rate": 4.1773543422499085e-05, "loss": 0.492, "step": 753 }, { "epoch": 0.7457962413452027, "grad_norm": 0.4437060967033037, "learning_rate": 4.175522169292781e-05, "loss": 0.5195, "step": 754 }, { "epoch": 0.7467853610286844, "grad_norm": 0.5587440781800124, "learning_rate": 4.173689996335654e-05, "loss": 0.5915, "step": 755 }, { "epoch": 0.7477744807121661, "grad_norm": 0.3503317806200955, "learning_rate": 4.171857823378527e-05, "loss": 0.5469, "step": 756 }, { "epoch": 0.7487636003956478, "grad_norm": 0.6127018887132037, "learning_rate": 4.1700256504214005e-05, "loss": 0.593, "step": 757 }, { "epoch": 0.7497527200791295, "grad_norm": 0.37713529443389476, "learning_rate": 4.168193477464273e-05, "loss": 0.5149, "step": 758 }, { "epoch": 0.7507418397626113, "grad_norm": 0.4418061035198679, "learning_rate": 4.166361304507146e-05, "loss": 0.5793, "step": 759 }, { "epoch": 0.751730959446093, "grad_norm": 0.45422082453440427, "learning_rate": 4.1645291315500184e-05, "loss": 0.5298, "step": 760 }, { "epoch": 0.7527200791295747, "grad_norm": 0.3609291523640555, "learning_rate": 4.162696958592892e-05, "loss": 0.5442, "step": 761 }, { "epoch": 0.7537091988130564, "grad_norm": 0.3952489577476288, "learning_rate": 4.1608647856357644e-05, "loss": 0.4714, "step": 762 }, { "epoch": 0.7546983184965381, "grad_norm": 0.4370926922917512, "learning_rate": 4.159032612678637e-05, "loss": 0.5229, "step": 763 }, { "epoch": 0.7556874381800198, "grad_norm": 0.37870613078543963, "learning_rate": 4.1572004397215096e-05, "loss": 0.4551, "step": 764 }, { "epoch": 0.7566765578635015, "grad_norm": 0.43124183371724567, "learning_rate": 4.155368266764383e-05, "loss": 0.5467, "step": 765 }, { "epoch": 0.7576656775469832, "grad_norm": 0.3746671658838302, "learning_rate": 4.1535360938072556e-05, "loss": 0.5807, "step": 766 }, { "epoch": 0.7586547972304649, "grad_norm": 0.46058945803907064, "learning_rate": 4.151703920850128e-05, "loss": 0.5734, "step": 767 }, { "epoch": 0.7596439169139466, "grad_norm": 0.3766410341280779, "learning_rate": 4.149871747893001e-05, "loss": 0.531, "step": 768 }, { "epoch": 0.7606330365974283, "grad_norm": 0.4917230854366619, "learning_rate": 4.148039574935874e-05, "loss": 0.529, "step": 769 }, { "epoch": 0.76162215628091, "grad_norm": 0.3601762759017485, "learning_rate": 4.1462074019787475e-05, "loss": 0.508, "step": 770 }, { "epoch": 0.7626112759643917, "grad_norm": 1.0927135392910616, "learning_rate": 4.1443752290216195e-05, "loss": 0.5042, "step": 771 }, { "epoch": 0.7636003956478734, "grad_norm": 0.564538713331542, "learning_rate": 4.142543056064493e-05, "loss": 0.5492, "step": 772 }, { "epoch": 0.7645895153313551, "grad_norm": 0.378038419799154, "learning_rate": 4.1407108831073654e-05, "loss": 0.5775, "step": 773 }, { "epoch": 0.7655786350148368, "grad_norm": 0.5132969793247761, "learning_rate": 4.138878710150239e-05, "loss": 0.543, "step": 774 }, { "epoch": 0.7665677546983185, "grad_norm": 0.4657837803710372, "learning_rate": 4.1370465371931114e-05, "loss": 0.599, "step": 775 }, { "epoch": 0.7675568743818002, "grad_norm": 0.41364334636794253, "learning_rate": 4.135214364235984e-05, "loss": 0.4887, "step": 776 }, { "epoch": 0.7685459940652819, "grad_norm": 0.48494156005877015, "learning_rate": 4.133382191278857e-05, "loss": 0.5205, "step": 777 }, { "epoch": 0.7695351137487636, "grad_norm": 0.5890553667830905, "learning_rate": 4.13155001832173e-05, "loss": 0.5641, "step": 778 }, { "epoch": 0.7705242334322453, "grad_norm": 0.4152911192832434, "learning_rate": 4.1297178453646027e-05, "loss": 0.5257, "step": 779 }, { "epoch": 0.771513353115727, "grad_norm": 1.6781124035661552, "learning_rate": 4.127885672407475e-05, "loss": 0.602, "step": 780 }, { "epoch": 0.7725024727992087, "grad_norm": 0.5975859652214398, "learning_rate": 4.1260534994503486e-05, "loss": 0.5373, "step": 781 }, { "epoch": 0.7734915924826904, "grad_norm": 0.6170145683827101, "learning_rate": 4.124221326493221e-05, "loss": 0.5326, "step": 782 }, { "epoch": 0.7744807121661721, "grad_norm": 0.612440557441423, "learning_rate": 4.122389153536094e-05, "loss": 0.5741, "step": 783 }, { "epoch": 0.7754698318496538, "grad_norm": 0.43061617125576557, "learning_rate": 4.1205569805789665e-05, "loss": 0.5607, "step": 784 }, { "epoch": 0.7764589515331355, "grad_norm": 0.5036215670510789, "learning_rate": 4.11872480762184e-05, "loss": 0.558, "step": 785 }, { "epoch": 0.7774480712166172, "grad_norm": 0.44972944661661773, "learning_rate": 4.1168926346647125e-05, "loss": 0.5524, "step": 786 }, { "epoch": 0.7784371909000989, "grad_norm": 0.5179361442529129, "learning_rate": 4.115060461707586e-05, "loss": 0.5389, "step": 787 }, { "epoch": 0.7794263105835806, "grad_norm": 0.40648583153660667, "learning_rate": 4.113228288750458e-05, "loss": 0.5839, "step": 788 }, { "epoch": 0.7804154302670623, "grad_norm": 0.4831293697463915, "learning_rate": 4.111396115793331e-05, "loss": 0.571, "step": 789 }, { "epoch": 0.781404549950544, "grad_norm": 0.8247754938487293, "learning_rate": 4.109563942836204e-05, "loss": 0.5338, "step": 790 }, { "epoch": 0.7823936696340257, "grad_norm": 0.4025900842786157, "learning_rate": 4.107731769879077e-05, "loss": 0.5634, "step": 791 }, { "epoch": 0.7833827893175074, "grad_norm": 0.40749518524574807, "learning_rate": 4.105899596921949e-05, "loss": 0.4944, "step": 792 }, { "epoch": 0.7843719090009891, "grad_norm": 0.5169892850825292, "learning_rate": 4.1040674239648223e-05, "loss": 0.5161, "step": 793 }, { "epoch": 0.7853610286844708, "grad_norm": 0.35450257355535614, "learning_rate": 4.102235251007696e-05, "loss": 0.5398, "step": 794 }, { "epoch": 0.7863501483679525, "grad_norm": 0.4013582591042465, "learning_rate": 4.100403078050568e-05, "loss": 0.5003, "step": 795 }, { "epoch": 0.7873392680514342, "grad_norm": 0.37673552582477354, "learning_rate": 4.098570905093441e-05, "loss": 0.4951, "step": 796 }, { "epoch": 0.7883283877349159, "grad_norm": 0.33875066525744113, "learning_rate": 4.0967387321363136e-05, "loss": 0.4806, "step": 797 }, { "epoch": 0.7893175074183977, "grad_norm": 0.476408908388567, "learning_rate": 4.094906559179187e-05, "loss": 0.5316, "step": 798 }, { "epoch": 0.7903066271018794, "grad_norm": 0.3860532691526498, "learning_rate": 4.0930743862220596e-05, "loss": 0.5714, "step": 799 }, { "epoch": 0.7912957467853611, "grad_norm": 0.30426399871981497, "learning_rate": 4.091242213264932e-05, "loss": 0.548, "step": 800 }, { "epoch": 0.7922848664688428, "grad_norm": 0.43022889880235965, "learning_rate": 4.089410040307805e-05, "loss": 0.4889, "step": 801 }, { "epoch": 0.7932739861523245, "grad_norm": 0.4014383758314632, "learning_rate": 4.087577867350678e-05, "loss": 0.5388, "step": 802 }, { "epoch": 0.7942631058358062, "grad_norm": 0.358329442879335, "learning_rate": 4.085745694393551e-05, "loss": 0.6311, "step": 803 }, { "epoch": 0.7952522255192879, "grad_norm": 0.5003987855371286, "learning_rate": 4.083913521436424e-05, "loss": 0.5505, "step": 804 }, { "epoch": 0.7962413452027696, "grad_norm": 0.39761394658298255, "learning_rate": 4.082081348479297e-05, "loss": 0.5423, "step": 805 }, { "epoch": 0.7972304648862513, "grad_norm": 0.39424324806445543, "learning_rate": 4.0802491755221694e-05, "loss": 0.5376, "step": 806 }, { "epoch": 0.798219584569733, "grad_norm": 0.4111741063989657, "learning_rate": 4.078417002565043e-05, "loss": 0.5637, "step": 807 }, { "epoch": 0.7992087042532147, "grad_norm": 0.4089042750817777, "learning_rate": 4.0765848296079154e-05, "loss": 0.6027, "step": 808 }, { "epoch": 0.8001978239366964, "grad_norm": 0.35649158749683113, "learning_rate": 4.074752656650788e-05, "loss": 0.5493, "step": 809 }, { "epoch": 0.8011869436201781, "grad_norm": 0.34390908785069546, "learning_rate": 4.0729204836936606e-05, "loss": 0.4582, "step": 810 }, { "epoch": 0.8021760633036598, "grad_norm": 0.46108022004907095, "learning_rate": 4.071088310736534e-05, "loss": 0.589, "step": 811 }, { "epoch": 0.8031651829871415, "grad_norm": 0.4470976570889435, "learning_rate": 4.0692561377794066e-05, "loss": 0.5783, "step": 812 }, { "epoch": 0.8041543026706232, "grad_norm": 0.41289681603112355, "learning_rate": 4.067423964822279e-05, "loss": 0.5335, "step": 813 }, { "epoch": 0.8051434223541049, "grad_norm": 0.3897455999305961, "learning_rate": 4.065591791865152e-05, "loss": 0.5395, "step": 814 }, { "epoch": 0.8061325420375866, "grad_norm": 0.5398949072700544, "learning_rate": 4.063759618908025e-05, "loss": 0.5419, "step": 815 }, { "epoch": 0.8071216617210683, "grad_norm": 0.40262994039410505, "learning_rate": 4.061927445950898e-05, "loss": 0.5674, "step": 816 }, { "epoch": 0.80811078140455, "grad_norm": 8.042258970876679, "learning_rate": 4.0600952729937705e-05, "loss": 1.1958, "step": 817 }, { "epoch": 0.8090999010880316, "grad_norm": 0.7150914848705251, "learning_rate": 4.058263100036644e-05, "loss": 0.4752, "step": 818 }, { "epoch": 0.8100890207715133, "grad_norm": 0.37450227733321634, "learning_rate": 4.0564309270795165e-05, "loss": 0.519, "step": 819 }, { "epoch": 0.811078140454995, "grad_norm": 0.7771184561273069, "learning_rate": 4.05459875412239e-05, "loss": 0.5667, "step": 820 }, { "epoch": 0.8120672601384767, "grad_norm": 0.39320567398584594, "learning_rate": 4.0527665811652624e-05, "loss": 0.5189, "step": 821 }, { "epoch": 0.8130563798219584, "grad_norm": 0.6164066236372652, "learning_rate": 4.050934408208135e-05, "loss": 0.5956, "step": 822 }, { "epoch": 0.8140454995054401, "grad_norm": 0.43784506001787876, "learning_rate": 4.049102235251008e-05, "loss": 0.5005, "step": 823 }, { "epoch": 0.8150346191889218, "grad_norm": 0.650347294511098, "learning_rate": 4.047270062293881e-05, "loss": 0.5911, "step": 824 }, { "epoch": 0.8160237388724035, "grad_norm": 0.3924537077442831, "learning_rate": 4.045437889336754e-05, "loss": 0.5942, "step": 825 }, { "epoch": 0.8170128585558852, "grad_norm": 0.5306835116575841, "learning_rate": 4.043605716379626e-05, "loss": 0.591, "step": 826 }, { "epoch": 0.8180019782393669, "grad_norm": 0.47068526472918815, "learning_rate": 4.041773543422499e-05, "loss": 0.5701, "step": 827 }, { "epoch": 0.8189910979228486, "grad_norm": 0.5433358297169092, "learning_rate": 4.039941370465372e-05, "loss": 0.5032, "step": 828 }, { "epoch": 0.8199802176063303, "grad_norm": 0.4117742532527006, "learning_rate": 4.038109197508245e-05, "loss": 0.5051, "step": 829 }, { "epoch": 0.820969337289812, "grad_norm": 0.45313367798585935, "learning_rate": 4.0362770245511175e-05, "loss": 0.5236, "step": 830 }, { "epoch": 0.8219584569732937, "grad_norm": 0.4743390307289405, "learning_rate": 4.034444851593991e-05, "loss": 0.5661, "step": 831 }, { "epoch": 0.8229475766567754, "grad_norm": 0.43419602719141287, "learning_rate": 4.0326126786368635e-05, "loss": 0.5262, "step": 832 }, { "epoch": 0.8239366963402571, "grad_norm": 0.44595712321645226, "learning_rate": 4.030780505679737e-05, "loss": 0.557, "step": 833 }, { "epoch": 0.8249258160237388, "grad_norm": 0.46065824846474296, "learning_rate": 4.028948332722609e-05, "loss": 0.5425, "step": 834 }, { "epoch": 0.8259149357072205, "grad_norm": 0.47395010516783537, "learning_rate": 4.027116159765482e-05, "loss": 0.5432, "step": 835 }, { "epoch": 0.8269040553907022, "grad_norm": 0.409348387036287, "learning_rate": 4.025283986808355e-05, "loss": 0.5371, "step": 836 }, { "epoch": 0.827893175074184, "grad_norm": 0.49737572694928917, "learning_rate": 4.023451813851228e-05, "loss": 0.517, "step": 837 }, { "epoch": 0.8288822947576657, "grad_norm": 0.3964787257419728, "learning_rate": 4.0216196408941e-05, "loss": 0.5921, "step": 838 }, { "epoch": 0.8298714144411474, "grad_norm": 0.3783571743456649, "learning_rate": 4.0197874679369734e-05, "loss": 0.5817, "step": 839 }, { "epoch": 0.8308605341246291, "grad_norm": 0.45317557022906846, "learning_rate": 4.017955294979846e-05, "loss": 0.5361, "step": 840 }, { "epoch": 0.8318496538081108, "grad_norm": 0.3855785042525516, "learning_rate": 4.016123122022719e-05, "loss": 0.5561, "step": 841 }, { "epoch": 0.8328387734915925, "grad_norm": 0.37735899597002825, "learning_rate": 4.014290949065592e-05, "loss": 0.4952, "step": 842 }, { "epoch": 0.8338278931750742, "grad_norm": 0.3910395508613529, "learning_rate": 4.0124587761084646e-05, "loss": 0.5248, "step": 843 }, { "epoch": 0.8348170128585559, "grad_norm": 0.41255181351776066, "learning_rate": 4.010626603151338e-05, "loss": 0.6143, "step": 844 }, { "epoch": 0.8358061325420376, "grad_norm": 0.3881103870770859, "learning_rate": 4.0087944301942106e-05, "loss": 0.5858, "step": 845 }, { "epoch": 0.8367952522255193, "grad_norm": 0.4610095412501324, "learning_rate": 4.006962257237083e-05, "loss": 0.4924, "step": 846 }, { "epoch": 0.837784371909001, "grad_norm": 0.36829299206900973, "learning_rate": 4.005130084279956e-05, "loss": 0.5079, "step": 847 }, { "epoch": 0.8387734915924827, "grad_norm": 0.40289319760057474, "learning_rate": 4.003297911322829e-05, "loss": 0.5482, "step": 848 }, { "epoch": 0.8397626112759644, "grad_norm": 0.4206511643989216, "learning_rate": 4.001465738365702e-05, "loss": 0.5605, "step": 849 }, { "epoch": 0.8407517309594461, "grad_norm": 0.41731945925784314, "learning_rate": 3.999633565408575e-05, "loss": 0.6092, "step": 850 }, { "epoch": 0.8417408506429278, "grad_norm": 0.30257425739688576, "learning_rate": 3.997801392451447e-05, "loss": 0.579, "step": 851 }, { "epoch": 0.8427299703264095, "grad_norm": 0.3864935018364275, "learning_rate": 3.9959692194943204e-05, "loss": 0.4771, "step": 852 }, { "epoch": 0.8437190900098912, "grad_norm": 0.3988101514171298, "learning_rate": 3.994137046537194e-05, "loss": 0.5624, "step": 853 }, { "epoch": 0.8447082096933729, "grad_norm": 0.39420755605728475, "learning_rate": 3.9923048735800664e-05, "loss": 0.5568, "step": 854 }, { "epoch": 0.8456973293768546, "grad_norm": 0.4921966493944131, "learning_rate": 3.990472700622939e-05, "loss": 0.5038, "step": 855 }, { "epoch": 0.8466864490603363, "grad_norm": 0.4365043760382965, "learning_rate": 3.9886405276658117e-05, "loss": 0.5222, "step": 856 }, { "epoch": 0.847675568743818, "grad_norm": 0.44342345181331283, "learning_rate": 3.986808354708685e-05, "loss": 0.5426, "step": 857 }, { "epoch": 0.8486646884272997, "grad_norm": 0.36474923885017907, "learning_rate": 3.9849761817515576e-05, "loss": 0.4866, "step": 858 }, { "epoch": 0.8496538081107814, "grad_norm": 0.4279054783487708, "learning_rate": 3.98314400879443e-05, "loss": 0.4454, "step": 859 }, { "epoch": 0.8506429277942631, "grad_norm": 0.32658261042232156, "learning_rate": 3.981311835837303e-05, "loss": 0.5478, "step": 860 }, { "epoch": 0.8516320474777448, "grad_norm": 0.3632391166208969, "learning_rate": 3.979479662880176e-05, "loss": 0.5293, "step": 861 }, { "epoch": 0.8526211671612265, "grad_norm": 0.361466100899412, "learning_rate": 3.977647489923049e-05, "loss": 0.5775, "step": 862 }, { "epoch": 0.8536102868447082, "grad_norm": 0.40555093574101875, "learning_rate": 3.9758153169659215e-05, "loss": 0.4856, "step": 863 }, { "epoch": 0.8545994065281899, "grad_norm": 0.3488107870670207, "learning_rate": 3.973983144008794e-05, "loss": 0.514, "step": 864 }, { "epoch": 0.8555885262116716, "grad_norm": 0.3843960708975495, "learning_rate": 3.9721509710516675e-05, "loss": 0.5101, "step": 865 }, { "epoch": 0.8565776458951533, "grad_norm": 0.41358392461083593, "learning_rate": 3.970318798094541e-05, "loss": 0.5516, "step": 866 }, { "epoch": 0.857566765578635, "grad_norm": 0.32460712314978024, "learning_rate": 3.9684866251374134e-05, "loss": 0.5351, "step": 867 }, { "epoch": 0.8585558852621167, "grad_norm": 0.4590823844091551, "learning_rate": 3.966654452180286e-05, "loss": 0.6195, "step": 868 }, { "epoch": 0.8595450049455984, "grad_norm": 0.36553223034798327, "learning_rate": 3.964822279223159e-05, "loss": 0.5288, "step": 869 }, { "epoch": 0.8605341246290801, "grad_norm": 0.42785798677542214, "learning_rate": 3.962990106266032e-05, "loss": 0.5341, "step": 870 }, { "epoch": 0.8615232443125618, "grad_norm": 0.34046802166560325, "learning_rate": 3.961157933308905e-05, "loss": 0.4989, "step": 871 }, { "epoch": 0.8625123639960435, "grad_norm": 0.4203148762636131, "learning_rate": 3.959325760351777e-05, "loss": 0.4626, "step": 872 }, { "epoch": 0.8635014836795252, "grad_norm": 0.46638243088571246, "learning_rate": 3.95749358739465e-05, "loss": 0.5535, "step": 873 }, { "epoch": 0.8644906033630069, "grad_norm": 0.3207059127416101, "learning_rate": 3.955661414437523e-05, "loss": 0.4701, "step": 874 }, { "epoch": 0.8654797230464887, "grad_norm": 0.4361065262778062, "learning_rate": 3.953829241480396e-05, "loss": 0.5232, "step": 875 }, { "epoch": 0.8664688427299704, "grad_norm": 0.4179015275913999, "learning_rate": 3.9519970685232686e-05, "loss": 0.4782, "step": 876 }, { "epoch": 0.8674579624134521, "grad_norm": 0.39023161716199395, "learning_rate": 3.950164895566142e-05, "loss": 0.5666, "step": 877 }, { "epoch": 0.8684470820969338, "grad_norm": 2.1308947045622375, "learning_rate": 3.9483327226090145e-05, "loss": 0.4915, "step": 878 }, { "epoch": 0.8694362017804155, "grad_norm": 0.6355227582113737, "learning_rate": 3.946500549651888e-05, "loss": 0.5845, "step": 879 }, { "epoch": 0.8704253214638972, "grad_norm": 0.4231786188850144, "learning_rate": 3.94466837669476e-05, "loss": 0.5186, "step": 880 }, { "epoch": 0.8714144411473789, "grad_norm": 0.5963762480510161, "learning_rate": 3.942836203737633e-05, "loss": 0.5478, "step": 881 }, { "epoch": 0.8724035608308606, "grad_norm": 0.44967258871723204, "learning_rate": 3.941004030780506e-05, "loss": 0.6044, "step": 882 }, { "epoch": 0.8733926805143423, "grad_norm": 0.49666107894686895, "learning_rate": 3.939171857823379e-05, "loss": 0.5572, "step": 883 }, { "epoch": 0.874381800197824, "grad_norm": 0.4511223362706553, "learning_rate": 3.937339684866251e-05, "loss": 0.5167, "step": 884 }, { "epoch": 0.8753709198813057, "grad_norm": 0.39070885224778895, "learning_rate": 3.9355075119091244e-05, "loss": 0.5711, "step": 885 }, { "epoch": 0.8763600395647874, "grad_norm": 0.4817666139801221, "learning_rate": 3.933675338951997e-05, "loss": 0.5614, "step": 886 }, { "epoch": 0.8773491592482691, "grad_norm": 0.37439555094432153, "learning_rate": 3.93184316599487e-05, "loss": 0.473, "step": 887 }, { "epoch": 0.8783382789317508, "grad_norm": 0.512933754535988, "learning_rate": 3.930010993037743e-05, "loss": 0.5664, "step": 888 }, { "epoch": 0.8793273986152325, "grad_norm": 0.3861195811560788, "learning_rate": 3.9281788200806156e-05, "loss": 0.4614, "step": 889 }, { "epoch": 0.8803165182987142, "grad_norm": 0.5026948257818956, "learning_rate": 3.926346647123489e-05, "loss": 0.5455, "step": 890 }, { "epoch": 0.8813056379821959, "grad_norm": 0.3797918839860446, "learning_rate": 3.9245144741663616e-05, "loss": 0.5669, "step": 891 }, { "epoch": 0.8822947576656776, "grad_norm": 0.3580599914264445, "learning_rate": 3.922682301209234e-05, "loss": 0.4969, "step": 892 }, { "epoch": 0.8832838773491593, "grad_norm": 0.4680809277442312, "learning_rate": 3.920850128252107e-05, "loss": 0.53, "step": 893 }, { "epoch": 0.884272997032641, "grad_norm": 0.3182331740958757, "learning_rate": 3.91901795529498e-05, "loss": 0.4916, "step": 894 }, { "epoch": 0.8852621167161226, "grad_norm": 0.38629102830836864, "learning_rate": 3.917185782337853e-05, "loss": 0.4831, "step": 895 }, { "epoch": 0.8862512363996043, "grad_norm": 0.38003929877472475, "learning_rate": 3.915353609380726e-05, "loss": 0.4544, "step": 896 }, { "epoch": 0.887240356083086, "grad_norm": 0.35819047353737965, "learning_rate": 3.913521436423598e-05, "loss": 0.6131, "step": 897 }, { "epoch": 0.8882294757665677, "grad_norm": 0.391106759277783, "learning_rate": 3.9116892634664714e-05, "loss": 0.5327, "step": 898 }, { "epoch": 0.8892185954500494, "grad_norm": 0.44913570347970116, "learning_rate": 3.909857090509344e-05, "loss": 0.5853, "step": 899 }, { "epoch": 0.8902077151335311, "grad_norm": 0.33654774158156275, "learning_rate": 3.9080249175522174e-05, "loss": 0.5172, "step": 900 }, { "epoch": 0.8911968348170128, "grad_norm": 0.4143598750420622, "learning_rate": 3.90619274459509e-05, "loss": 0.4848, "step": 901 }, { "epoch": 0.8921859545004945, "grad_norm": 0.524991445546096, "learning_rate": 3.904360571637963e-05, "loss": 0.4199, "step": 902 }, { "epoch": 0.8931750741839762, "grad_norm": 0.3927175649035216, "learning_rate": 3.902528398680836e-05, "loss": 0.5881, "step": 903 }, { "epoch": 0.8941641938674579, "grad_norm": 0.4079491608662268, "learning_rate": 3.9006962257237086e-05, "loss": 0.5405, "step": 904 }, { "epoch": 0.8951533135509396, "grad_norm": 0.424142781885458, "learning_rate": 3.898864052766581e-05, "loss": 0.5487, "step": 905 }, { "epoch": 0.8961424332344213, "grad_norm": 0.41683695083750005, "learning_rate": 3.897031879809454e-05, "loss": 0.4787, "step": 906 }, { "epoch": 0.897131552917903, "grad_norm": 0.3547610738651701, "learning_rate": 3.895199706852327e-05, "loss": 0.4979, "step": 907 }, { "epoch": 0.8981206726013847, "grad_norm": 2.3469674956937228, "learning_rate": 3.8933675338952e-05, "loss": 0.5794, "step": 908 }, { "epoch": 0.8991097922848664, "grad_norm": 0.6412020698124778, "learning_rate": 3.8915353609380725e-05, "loss": 0.5268, "step": 909 }, { "epoch": 0.9000989119683481, "grad_norm": 1.0383233848694802, "learning_rate": 3.889703187980945e-05, "loss": 0.5414, "step": 910 }, { "epoch": 0.9010880316518298, "grad_norm": 0.43251475437881326, "learning_rate": 3.8878710150238185e-05, "loss": 0.4797, "step": 911 }, { "epoch": 0.9020771513353115, "grad_norm": 0.5157606933923597, "learning_rate": 3.886038842066692e-05, "loss": 0.5654, "step": 912 }, { "epoch": 0.9030662710187932, "grad_norm": 0.3743540399979926, "learning_rate": 3.8842066691095644e-05, "loss": 0.5119, "step": 913 }, { "epoch": 0.904055390702275, "grad_norm": 0.42098522740414857, "learning_rate": 3.882374496152437e-05, "loss": 0.489, "step": 914 }, { "epoch": 0.9050445103857567, "grad_norm": 0.5243073127176688, "learning_rate": 3.88054232319531e-05, "loss": 0.5039, "step": 915 }, { "epoch": 0.9060336300692384, "grad_norm": 0.3986349967328183, "learning_rate": 3.878710150238183e-05, "loss": 0.5337, "step": 916 }, { "epoch": 0.9070227497527201, "grad_norm": 0.5429828786199771, "learning_rate": 3.876877977281056e-05, "loss": 0.5475, "step": 917 }, { "epoch": 0.9080118694362018, "grad_norm": 0.44994465926714916, "learning_rate": 3.875045804323928e-05, "loss": 0.5136, "step": 918 }, { "epoch": 0.9090009891196835, "grad_norm": 0.3415738750821067, "learning_rate": 3.873213631366801e-05, "loss": 0.5346, "step": 919 }, { "epoch": 0.9099901088031652, "grad_norm": 0.3937349524036801, "learning_rate": 3.871381458409674e-05, "loss": 0.466, "step": 920 }, { "epoch": 0.9109792284866469, "grad_norm": 0.5131695235804504, "learning_rate": 3.869549285452547e-05, "loss": 0.6701, "step": 921 }, { "epoch": 0.9119683481701286, "grad_norm": 0.33653367131443224, "learning_rate": 3.8677171124954196e-05, "loss": 0.4473, "step": 922 }, { "epoch": 0.9129574678536103, "grad_norm": 0.5446871845384862, "learning_rate": 3.865884939538292e-05, "loss": 0.5528, "step": 923 }, { "epoch": 0.913946587537092, "grad_norm": 0.5015814740270789, "learning_rate": 3.8640527665811655e-05, "loss": 0.5226, "step": 924 }, { "epoch": 0.9149357072205737, "grad_norm": 0.7983585338010668, "learning_rate": 3.862220593624039e-05, "loss": 0.4371, "step": 925 }, { "epoch": 0.9159248269040554, "grad_norm": 0.48904182262923174, "learning_rate": 3.860388420666911e-05, "loss": 0.5594, "step": 926 }, { "epoch": 0.9169139465875371, "grad_norm": 0.4503935788610765, "learning_rate": 3.858556247709784e-05, "loss": 0.6133, "step": 927 }, { "epoch": 0.9179030662710188, "grad_norm": 0.3165947004747236, "learning_rate": 3.856724074752657e-05, "loss": 0.4772, "step": 928 }, { "epoch": 0.9188921859545005, "grad_norm": 0.5586077488303228, "learning_rate": 3.85489190179553e-05, "loss": 0.5199, "step": 929 }, { "epoch": 0.9198813056379822, "grad_norm": 0.5085374651276036, "learning_rate": 3.853059728838402e-05, "loss": 0.5306, "step": 930 }, { "epoch": 0.9208704253214639, "grad_norm": 0.4129244011177407, "learning_rate": 3.8512275558812754e-05, "loss": 0.4806, "step": 931 }, { "epoch": 0.9218595450049456, "grad_norm": 0.4861459748388243, "learning_rate": 3.849395382924148e-05, "loss": 0.4908, "step": 932 }, { "epoch": 0.9228486646884273, "grad_norm": 0.41526131109253767, "learning_rate": 3.847563209967021e-05, "loss": 0.5276, "step": 933 }, { "epoch": 0.923837784371909, "grad_norm": 0.4009408084973653, "learning_rate": 3.845731037009894e-05, "loss": 0.5562, "step": 934 }, { "epoch": 0.9248269040553907, "grad_norm": 0.5745910621076383, "learning_rate": 3.8438988640527666e-05, "loss": 0.5774, "step": 935 }, { "epoch": 0.9258160237388724, "grad_norm": 0.4637760415387979, "learning_rate": 3.84206669109564e-05, "loss": 0.5435, "step": 936 }, { "epoch": 0.9268051434223541, "grad_norm": 0.41864980359230897, "learning_rate": 3.8402345181385126e-05, "loss": 0.5391, "step": 937 }, { "epoch": 0.9277942631058358, "grad_norm": 1.1869062780052408, "learning_rate": 3.838402345181385e-05, "loss": 0.5023, "step": 938 }, { "epoch": 0.9287833827893175, "grad_norm": 0.43776779645633507, "learning_rate": 3.836570172224258e-05, "loss": 0.5096, "step": 939 }, { "epoch": 0.9297725024727992, "grad_norm": 0.5198208217050955, "learning_rate": 3.834737999267131e-05, "loss": 0.5335, "step": 940 }, { "epoch": 0.9307616221562809, "grad_norm": 0.5235285409308027, "learning_rate": 3.832905826310004e-05, "loss": 0.5847, "step": 941 }, { "epoch": 0.9317507418397626, "grad_norm": 0.44126732924600726, "learning_rate": 3.831073653352877e-05, "loss": 0.4719, "step": 942 }, { "epoch": 0.9327398615232443, "grad_norm": 0.5054573511058064, "learning_rate": 3.829241480395749e-05, "loss": 0.5096, "step": 943 }, { "epoch": 0.933728981206726, "grad_norm": 0.5805833518404947, "learning_rate": 3.8274093074386224e-05, "loss": 0.5011, "step": 944 }, { "epoch": 0.9347181008902077, "grad_norm": 2.2314124854517408, "learning_rate": 3.825577134481495e-05, "loss": 0.5527, "step": 945 }, { "epoch": 0.9357072205736894, "grad_norm": 0.5483697457861862, "learning_rate": 3.8237449615243684e-05, "loss": 0.5321, "step": 946 }, { "epoch": 0.9366963402571711, "grad_norm": 0.47552103451002065, "learning_rate": 3.8219127885672404e-05, "loss": 0.5372, "step": 947 }, { "epoch": 0.9376854599406528, "grad_norm": 0.31286512825007917, "learning_rate": 3.820080615610114e-05, "loss": 0.4985, "step": 948 }, { "epoch": 0.9386745796241345, "grad_norm": 1.7242082421798055, "learning_rate": 3.818248442652987e-05, "loss": 0.5405, "step": 949 }, { "epoch": 0.9396636993076162, "grad_norm": 0.4700031516218264, "learning_rate": 3.8164162696958596e-05, "loss": 0.4871, "step": 950 }, { "epoch": 0.9406528189910979, "grad_norm": 0.38084464993511985, "learning_rate": 3.814584096738732e-05, "loss": 0.4736, "step": 951 }, { "epoch": 0.9416419386745796, "grad_norm": 0.4483615144248822, "learning_rate": 3.812751923781605e-05, "loss": 0.5035, "step": 952 }, { "epoch": 0.9426310583580614, "grad_norm": 0.3741684709974548, "learning_rate": 3.810919750824478e-05, "loss": 0.5079, "step": 953 }, { "epoch": 0.9436201780415431, "grad_norm": 0.3159720304753471, "learning_rate": 3.809087577867351e-05, "loss": 0.5318, "step": 954 }, { "epoch": 0.9446092977250248, "grad_norm": 0.4215113817807427, "learning_rate": 3.8072554049102235e-05, "loss": 0.5136, "step": 955 }, { "epoch": 0.9455984174085065, "grad_norm": 0.37425060046032715, "learning_rate": 3.805423231953096e-05, "loss": 0.5254, "step": 956 }, { "epoch": 0.9465875370919882, "grad_norm": 0.3257675406702953, "learning_rate": 3.8035910589959695e-05, "loss": 0.5079, "step": 957 }, { "epoch": 0.9475766567754699, "grad_norm": 0.45095100461312454, "learning_rate": 3.801758886038842e-05, "loss": 0.512, "step": 958 }, { "epoch": 0.9485657764589516, "grad_norm": 0.4090749619120724, "learning_rate": 3.7999267130817154e-05, "loss": 0.5396, "step": 959 }, { "epoch": 0.9495548961424333, "grad_norm": 0.39265990057296607, "learning_rate": 3.798094540124588e-05, "loss": 0.4991, "step": 960 }, { "epoch": 0.950544015825915, "grad_norm": 0.4063218758435763, "learning_rate": 3.796262367167461e-05, "loss": 0.489, "step": 961 }, { "epoch": 0.9515331355093967, "grad_norm": 0.4829638146436522, "learning_rate": 3.794430194210334e-05, "loss": 0.5859, "step": 962 }, { "epoch": 0.9525222551928784, "grad_norm": 0.33683234605221707, "learning_rate": 3.792598021253207e-05, "loss": 0.5402, "step": 963 }, { "epoch": 0.9535113748763601, "grad_norm": 0.4136710761072618, "learning_rate": 3.790765848296079e-05, "loss": 0.6248, "step": 964 }, { "epoch": 0.9545004945598418, "grad_norm": 0.44419096696115934, "learning_rate": 3.788933675338952e-05, "loss": 0.5687, "step": 965 }, { "epoch": 0.9554896142433235, "grad_norm": 0.314815697263912, "learning_rate": 3.787101502381825e-05, "loss": 0.5257, "step": 966 }, { "epoch": 0.9564787339268052, "grad_norm": 0.3767859758997778, "learning_rate": 3.785269329424698e-05, "loss": 0.4821, "step": 967 }, { "epoch": 0.9574678536102869, "grad_norm": 0.3695256393459967, "learning_rate": 3.7834371564675706e-05, "loss": 0.5113, "step": 968 }, { "epoch": 0.9584569732937686, "grad_norm": 0.29897151173130265, "learning_rate": 3.781604983510443e-05, "loss": 0.5127, "step": 969 }, { "epoch": 0.9594460929772503, "grad_norm": 0.3406098944483567, "learning_rate": 3.7797728105533165e-05, "loss": 0.4787, "step": 970 }, { "epoch": 0.960435212660732, "grad_norm": 0.3727822915676798, "learning_rate": 3.777940637596189e-05, "loss": 0.5348, "step": 971 }, { "epoch": 0.9614243323442137, "grad_norm": 0.3140702134023394, "learning_rate": 3.776108464639062e-05, "loss": 0.4602, "step": 972 }, { "epoch": 0.9624134520276953, "grad_norm": 0.3848508421746314, "learning_rate": 3.774276291681935e-05, "loss": 0.5356, "step": 973 }, { "epoch": 0.963402571711177, "grad_norm": 0.32214698366653616, "learning_rate": 3.772444118724808e-05, "loss": 0.5184, "step": 974 }, { "epoch": 0.9643916913946587, "grad_norm": 0.36714549493193244, "learning_rate": 3.770611945767681e-05, "loss": 0.4755, "step": 975 }, { "epoch": 0.9653808110781404, "grad_norm": 0.34869975067509607, "learning_rate": 3.768779772810553e-05, "loss": 0.6225, "step": 976 }, { "epoch": 0.9663699307616221, "grad_norm": 0.3820217670798009, "learning_rate": 3.7669475998534264e-05, "loss": 0.4952, "step": 977 }, { "epoch": 0.9673590504451038, "grad_norm": 0.3985817080513296, "learning_rate": 3.765115426896299e-05, "loss": 0.5567, "step": 978 }, { "epoch": 0.9683481701285855, "grad_norm": 0.31651201835869763, "learning_rate": 3.7632832539391723e-05, "loss": 0.5074, "step": 979 }, { "epoch": 0.9693372898120672, "grad_norm": 0.37619456746828517, "learning_rate": 3.761451080982045e-05, "loss": 0.5589, "step": 980 }, { "epoch": 0.9703264094955489, "grad_norm": 0.3623920056918156, "learning_rate": 3.7596189080249176e-05, "loss": 0.4757, "step": 981 }, { "epoch": 0.9713155291790306, "grad_norm": 0.3420143952848393, "learning_rate": 3.75778673506779e-05, "loss": 0.4884, "step": 982 }, { "epoch": 0.9723046488625123, "grad_norm": 0.32751462019753536, "learning_rate": 3.7559545621106636e-05, "loss": 0.4947, "step": 983 }, { "epoch": 0.973293768545994, "grad_norm": 0.31318281028088973, "learning_rate": 3.754122389153536e-05, "loss": 0.4836, "step": 984 }, { "epoch": 0.9742828882294757, "grad_norm": 0.38127965484424997, "learning_rate": 3.752290216196409e-05, "loss": 0.5345, "step": 985 }, { "epoch": 0.9752720079129574, "grad_norm": 0.37819403084012576, "learning_rate": 3.750458043239282e-05, "loss": 0.4964, "step": 986 }, { "epoch": 0.9762611275964391, "grad_norm": 0.3560452300150932, "learning_rate": 3.748625870282155e-05, "loss": 0.5693, "step": 987 }, { "epoch": 0.9772502472799208, "grad_norm": 0.40812765024892683, "learning_rate": 3.746793697325028e-05, "loss": 0.5594, "step": 988 }, { "epoch": 0.9782393669634025, "grad_norm": 0.39374505335822296, "learning_rate": 3.7449615243679e-05, "loss": 0.5583, "step": 989 }, { "epoch": 0.9792284866468842, "grad_norm": 0.37540173769820445, "learning_rate": 3.7431293514107734e-05, "loss": 0.4881, "step": 990 }, { "epoch": 0.9802176063303659, "grad_norm": 0.443962903657654, "learning_rate": 3.741297178453646e-05, "loss": 0.5349, "step": 991 }, { "epoch": 0.9812067260138477, "grad_norm": 0.3858662193153671, "learning_rate": 3.7394650054965194e-05, "loss": 0.5483, "step": 992 }, { "epoch": 0.9821958456973294, "grad_norm": 0.8166142256059559, "learning_rate": 3.7376328325393914e-05, "loss": 0.591, "step": 993 }, { "epoch": 0.9831849653808111, "grad_norm": 0.4159757188981098, "learning_rate": 3.735800659582265e-05, "loss": 0.5583, "step": 994 }, { "epoch": 0.9841740850642928, "grad_norm": 0.8142513421373967, "learning_rate": 3.733968486625137e-05, "loss": 0.5662, "step": 995 }, { "epoch": 0.9851632047477745, "grad_norm": 0.491701116849365, "learning_rate": 3.7321363136680106e-05, "loss": 0.4751, "step": 996 }, { "epoch": 0.9861523244312562, "grad_norm": 0.6257416157107429, "learning_rate": 3.730304140710883e-05, "loss": 0.5465, "step": 997 }, { "epoch": 0.9871414441147379, "grad_norm": 0.3803596226603389, "learning_rate": 3.728471967753756e-05, "loss": 0.5614, "step": 998 }, { "epoch": 0.9881305637982196, "grad_norm": 0.48848649838986524, "learning_rate": 3.726639794796629e-05, "loss": 0.5025, "step": 999 }, { "epoch": 0.9891196834817013, "grad_norm": 0.6067005301320401, "learning_rate": 3.724807621839502e-05, "loss": 0.5909, "step": 1000 }, { "epoch": 0.990108803165183, "grad_norm": 0.30491012951863644, "learning_rate": 3.7229754488823745e-05, "loss": 0.5312, "step": 1001 }, { "epoch": 0.9910979228486647, "grad_norm": 0.5263181916386561, "learning_rate": 3.721143275925247e-05, "loss": 0.5418, "step": 1002 }, { "epoch": 0.9920870425321464, "grad_norm": 0.48669107191244604, "learning_rate": 3.7193111029681205e-05, "loss": 0.4754, "step": 1003 }, { "epoch": 0.9930761622156281, "grad_norm": 0.32683564491319556, "learning_rate": 3.717478930010993e-05, "loss": 0.4993, "step": 1004 }, { "epoch": 0.9940652818991098, "grad_norm": 0.41176858367306507, "learning_rate": 3.7156467570538664e-05, "loss": 0.5087, "step": 1005 }, { "epoch": 0.9950544015825915, "grad_norm": 0.47830883072176333, "learning_rate": 3.7138145840967384e-05, "loss": 0.5465, "step": 1006 }, { "epoch": 0.9960435212660732, "grad_norm": 0.3751711273770695, "learning_rate": 3.711982411139612e-05, "loss": 0.5465, "step": 1007 }, { "epoch": 0.9970326409495549, "grad_norm": 0.3863008248417529, "learning_rate": 3.710150238182485e-05, "loss": 0.5453, "step": 1008 }, { "epoch": 0.9980217606330366, "grad_norm": 0.35255489104921495, "learning_rate": 3.708318065225358e-05, "loss": 0.4655, "step": 1009 }, { "epoch": 0.9990108803165183, "grad_norm": 0.3485913687147211, "learning_rate": 3.70648589226823e-05, "loss": 0.5097, "step": 1010 }, { "epoch": 1.0, "grad_norm": 0.4578352977667433, "learning_rate": 3.704653719311103e-05, "loss": 0.5795, "step": 1011 }, { "epoch": 1.0009891196834817, "grad_norm": 0.38768311863290106, "learning_rate": 3.702821546353976e-05, "loss": 0.401, "step": 1012 }, { "epoch": 1.0019782393669634, "grad_norm": 0.3714367929643285, "learning_rate": 3.700989373396849e-05, "loss": 0.43, "step": 1013 }, { "epoch": 1.002967359050445, "grad_norm": 0.39615149577959957, "learning_rate": 3.6991572004397216e-05, "loss": 0.5277, "step": 1014 }, { "epoch": 1.0039564787339268, "grad_norm": 0.2867304423631312, "learning_rate": 3.697325027482594e-05, "loss": 0.4223, "step": 1015 }, { "epoch": 1.0049455984174085, "grad_norm": 0.3693181837334904, "learning_rate": 3.6954928545254675e-05, "loss": 0.4469, "step": 1016 }, { "epoch": 1.0059347181008902, "grad_norm": 0.34055845537716634, "learning_rate": 3.69366068156834e-05, "loss": 0.4902, "step": 1017 }, { "epoch": 1.006923837784372, "grad_norm": 0.3429833347762005, "learning_rate": 3.691828508611213e-05, "loss": 0.4396, "step": 1018 }, { "epoch": 1.0079129574678536, "grad_norm": 0.3693590550132719, "learning_rate": 3.6899963356540855e-05, "loss": 0.425, "step": 1019 }, { "epoch": 1.0089020771513353, "grad_norm": 3.0499675675684976, "learning_rate": 3.688164162696959e-05, "loss": 0.5673, "step": 1020 }, { "epoch": 1.009891196834817, "grad_norm": 3.45703439958464, "learning_rate": 3.686331989739832e-05, "loss": 0.4238, "step": 1021 }, { "epoch": 1.0108803165182987, "grad_norm": 1.1262732708801717, "learning_rate": 3.684499816782704e-05, "loss": 0.4644, "step": 1022 }, { "epoch": 1.0118694362017804, "grad_norm": 0.35784290295603, "learning_rate": 3.6826676438255774e-05, "loss": 0.489, "step": 1023 }, { "epoch": 1.012858555885262, "grad_norm": 0.4608616029921624, "learning_rate": 3.68083547086845e-05, "loss": 0.4936, "step": 1024 }, { "epoch": 1.0138476755687438, "grad_norm": 0.424887693355373, "learning_rate": 3.6790032979113233e-05, "loss": 0.5167, "step": 1025 }, { "epoch": 1.0148367952522255, "grad_norm": 0.447495609689506, "learning_rate": 3.677171124954196e-05, "loss": 0.4492, "step": 1026 }, { "epoch": 1.0158259149357072, "grad_norm": 0.36530975855838116, "learning_rate": 3.6753389519970686e-05, "loss": 0.4778, "step": 1027 }, { "epoch": 1.0168150346191889, "grad_norm": 1.9814451602515812, "learning_rate": 3.673506779039941e-05, "loss": 0.4786, "step": 1028 }, { "epoch": 1.0178041543026706, "grad_norm": 0.5630016104667595, "learning_rate": 3.6716746060828146e-05, "loss": 0.4327, "step": 1029 }, { "epoch": 1.0187932739861523, "grad_norm": 0.36111632730607574, "learning_rate": 3.669842433125687e-05, "loss": 0.4235, "step": 1030 }, { "epoch": 1.019782393669634, "grad_norm": 5.436341015640222, "learning_rate": 3.66801026016856e-05, "loss": 0.5393, "step": 1031 }, { "epoch": 1.0207715133531157, "grad_norm": 0.7375907758295829, "learning_rate": 3.666178087211433e-05, "loss": 0.4285, "step": 1032 }, { "epoch": 1.0217606330365974, "grad_norm": 0.3037209911593098, "learning_rate": 3.664345914254306e-05, "loss": 0.4599, "step": 1033 }, { "epoch": 1.022749752720079, "grad_norm": 0.7465649356485863, "learning_rate": 3.662513741297179e-05, "loss": 0.4379, "step": 1034 }, { "epoch": 1.0237388724035608, "grad_norm": 0.43790277448859827, "learning_rate": 3.660681568340051e-05, "loss": 0.4044, "step": 1035 }, { "epoch": 1.0247279920870425, "grad_norm": 0.6032981713354464, "learning_rate": 3.6588493953829244e-05, "loss": 0.4908, "step": 1036 }, { "epoch": 1.0257171117705242, "grad_norm": 0.5761051820115305, "learning_rate": 3.657017222425797e-05, "loss": 0.4773, "step": 1037 }, { "epoch": 1.0267062314540059, "grad_norm": 0.7647537299042617, "learning_rate": 3.6551850494686704e-05, "loss": 0.4242, "step": 1038 }, { "epoch": 1.0276953511374876, "grad_norm": 0.5762122743965227, "learning_rate": 3.6533528765115424e-05, "loss": 0.4405, "step": 1039 }, { "epoch": 1.0286844708209693, "grad_norm": 0.3009983442497535, "learning_rate": 3.651520703554416e-05, "loss": 0.4678, "step": 1040 }, { "epoch": 1.029673590504451, "grad_norm": 0.562112523671321, "learning_rate": 3.649688530597288e-05, "loss": 0.4621, "step": 1041 }, { "epoch": 1.0306627101879327, "grad_norm": 0.37989462590052814, "learning_rate": 3.6478563576401616e-05, "loss": 0.4267, "step": 1042 }, { "epoch": 1.0316518298714143, "grad_norm": 1.0300574287245914, "learning_rate": 3.646024184683034e-05, "loss": 0.4476, "step": 1043 }, { "epoch": 1.032640949554896, "grad_norm": 0.556363323634819, "learning_rate": 3.644192011725907e-05, "loss": 0.4468, "step": 1044 }, { "epoch": 1.0336300692383777, "grad_norm": 0.3386788946695046, "learning_rate": 3.64235983876878e-05, "loss": 0.4364, "step": 1045 }, { "epoch": 1.0346191889218594, "grad_norm": 0.5192836886115852, "learning_rate": 3.640527665811653e-05, "loss": 0.4294, "step": 1046 }, { "epoch": 1.0356083086053411, "grad_norm": 0.37484641311334366, "learning_rate": 3.6386954928545255e-05, "loss": 0.4691, "step": 1047 }, { "epoch": 1.0365974282888228, "grad_norm": 0.43983528468602306, "learning_rate": 3.636863319897398e-05, "loss": 0.4607, "step": 1048 }, { "epoch": 1.0375865479723045, "grad_norm": 1.031961425019673, "learning_rate": 3.6350311469402715e-05, "loss": 0.5229, "step": 1049 }, { "epoch": 1.0385756676557865, "grad_norm": 3.2210405891971194, "learning_rate": 3.633198973983144e-05, "loss": 0.5568, "step": 1050 }, { "epoch": 1.039564787339268, "grad_norm": 0.6782949578972237, "learning_rate": 3.6313668010260175e-05, "loss": 0.4354, "step": 1051 }, { "epoch": 1.0405539070227499, "grad_norm": 0.4127185908000618, "learning_rate": 3.6295346280688894e-05, "loss": 0.4387, "step": 1052 }, { "epoch": 1.0415430267062316, "grad_norm": 0.490074753069455, "learning_rate": 3.627702455111763e-05, "loss": 0.4307, "step": 1053 }, { "epoch": 1.0425321463897133, "grad_norm": 0.6921248034479954, "learning_rate": 3.6258702821546354e-05, "loss": 0.4723, "step": 1054 }, { "epoch": 1.043521266073195, "grad_norm": 0.393153265610656, "learning_rate": 3.624038109197509e-05, "loss": 0.4716, "step": 1055 }, { "epoch": 1.0445103857566767, "grad_norm": 0.5649667904822261, "learning_rate": 3.6222059362403813e-05, "loss": 0.4793, "step": 1056 }, { "epoch": 1.0454995054401583, "grad_norm": 1.8299145260790468, "learning_rate": 3.620373763283254e-05, "loss": 0.4621, "step": 1057 }, { "epoch": 1.04648862512364, "grad_norm": 1.725090631699042, "learning_rate": 3.618541590326127e-05, "loss": 0.4317, "step": 1058 }, { "epoch": 1.0474777448071217, "grad_norm": 0.537785509003733, "learning_rate": 3.616709417369e-05, "loss": 0.4513, "step": 1059 }, { "epoch": 1.0484668644906034, "grad_norm": 0.33792548688856516, "learning_rate": 3.6148772444118726e-05, "loss": 0.4657, "step": 1060 }, { "epoch": 1.0494559841740851, "grad_norm": 0.8248367654993363, "learning_rate": 3.613045071454745e-05, "loss": 0.4434, "step": 1061 }, { "epoch": 1.0504451038575668, "grad_norm": 0.4266254409159955, "learning_rate": 3.6112128984976185e-05, "loss": 0.4681, "step": 1062 }, { "epoch": 1.0514342235410485, "grad_norm": 0.42576334310945435, "learning_rate": 3.609380725540491e-05, "loss": 0.4561, "step": 1063 }, { "epoch": 1.0524233432245302, "grad_norm": 0.5266659764246051, "learning_rate": 3.607548552583364e-05, "loss": 0.4802, "step": 1064 }, { "epoch": 1.053412462908012, "grad_norm": 0.810147632482765, "learning_rate": 3.6057163796262365e-05, "loss": 0.4591, "step": 1065 }, { "epoch": 1.0544015825914936, "grad_norm": 0.413804915322679, "learning_rate": 3.60388420666911e-05, "loss": 0.3825, "step": 1066 }, { "epoch": 1.0553907022749753, "grad_norm": 0.44504381838193996, "learning_rate": 3.602052033711983e-05, "loss": 0.4876, "step": 1067 }, { "epoch": 1.056379821958457, "grad_norm": 0.4439897157440488, "learning_rate": 3.600219860754855e-05, "loss": 0.4253, "step": 1068 }, { "epoch": 1.0573689416419387, "grad_norm": 0.3500704299271919, "learning_rate": 3.5983876877977284e-05, "loss": 0.4336, "step": 1069 }, { "epoch": 1.0583580613254204, "grad_norm": 0.33953086838015223, "learning_rate": 3.596555514840601e-05, "loss": 0.4255, "step": 1070 }, { "epoch": 1.0593471810089021, "grad_norm": 0.48346975571239686, "learning_rate": 3.5947233418834744e-05, "loss": 0.4609, "step": 1071 }, { "epoch": 1.0603363006923838, "grad_norm": 0.3187757649005785, "learning_rate": 3.592891168926347e-05, "loss": 0.4665, "step": 1072 }, { "epoch": 1.0613254203758655, "grad_norm": 0.42833406748043845, "learning_rate": 3.5910589959692196e-05, "loss": 0.3678, "step": 1073 }, { "epoch": 1.0623145400593472, "grad_norm": 0.4566593411257765, "learning_rate": 3.589226823012092e-05, "loss": 0.4284, "step": 1074 }, { "epoch": 1.063303659742829, "grad_norm": 0.3468485072829719, "learning_rate": 3.5873946500549656e-05, "loss": 0.4746, "step": 1075 }, { "epoch": 1.0642927794263106, "grad_norm": 0.510162571679684, "learning_rate": 3.585562477097838e-05, "loss": 0.4224, "step": 1076 }, { "epoch": 1.0652818991097923, "grad_norm": 0.3979585894735656, "learning_rate": 3.583730304140711e-05, "loss": 0.4646, "step": 1077 }, { "epoch": 1.066271018793274, "grad_norm": 0.36760884964977675, "learning_rate": 3.5818981311835835e-05, "loss": 0.4491, "step": 1078 }, { "epoch": 1.0672601384767557, "grad_norm": 0.5145237916838321, "learning_rate": 3.580065958226457e-05, "loss": 0.49, "step": 1079 }, { "epoch": 1.0682492581602374, "grad_norm": 0.3129933723610313, "learning_rate": 3.57823378526933e-05, "loss": 0.443, "step": 1080 }, { "epoch": 1.0692383778437191, "grad_norm": 0.44086395340998813, "learning_rate": 3.576401612312202e-05, "loss": 0.4624, "step": 1081 }, { "epoch": 1.0702274975272008, "grad_norm": 0.3521648112115606, "learning_rate": 3.5745694393550754e-05, "loss": 0.492, "step": 1082 }, { "epoch": 1.0712166172106825, "grad_norm": 0.34957384405320735, "learning_rate": 3.572737266397948e-05, "loss": 0.4699, "step": 1083 }, { "epoch": 1.0722057368941642, "grad_norm": 0.4051017779258357, "learning_rate": 3.5709050934408214e-05, "loss": 0.4751, "step": 1084 }, { "epoch": 1.073194856577646, "grad_norm": 0.3301743394383747, "learning_rate": 3.5690729204836934e-05, "loss": 0.4765, "step": 1085 }, { "epoch": 1.0741839762611276, "grad_norm": 0.37660803140457133, "learning_rate": 3.567240747526567e-05, "loss": 0.4697, "step": 1086 }, { "epoch": 1.0751730959446093, "grad_norm": 0.33541330344202874, "learning_rate": 3.565408574569439e-05, "loss": 0.3767, "step": 1087 }, { "epoch": 1.076162215628091, "grad_norm": 0.31431860272850104, "learning_rate": 3.5635764016123127e-05, "loss": 0.4464, "step": 1088 }, { "epoch": 1.0771513353115727, "grad_norm": 0.3822236919960425, "learning_rate": 3.561744228655185e-05, "loss": 0.417, "step": 1089 }, { "epoch": 1.0781404549950544, "grad_norm": 0.3436590951082615, "learning_rate": 3.559912055698058e-05, "loss": 0.4275, "step": 1090 }, { "epoch": 1.079129574678536, "grad_norm": 0.3085576056896924, "learning_rate": 3.558079882740931e-05, "loss": 0.3597, "step": 1091 }, { "epoch": 1.0801186943620178, "grad_norm": 0.27756477292049936, "learning_rate": 3.556247709783804e-05, "loss": 0.3784, "step": 1092 }, { "epoch": 1.0811078140454995, "grad_norm": 0.41722858436924626, "learning_rate": 3.5544155368266765e-05, "loss": 0.4965, "step": 1093 }, { "epoch": 1.0820969337289812, "grad_norm": 0.3379090288767247, "learning_rate": 3.552583363869549e-05, "loss": 0.4804, "step": 1094 }, { "epoch": 1.083086053412463, "grad_norm": 0.352934575359697, "learning_rate": 3.5507511909124225e-05, "loss": 0.388, "step": 1095 }, { "epoch": 1.0840751730959446, "grad_norm": 2.8132481274950853, "learning_rate": 3.548919017955295e-05, "loss": 0.5017, "step": 1096 }, { "epoch": 1.0850642927794263, "grad_norm": 0.45069457909979543, "learning_rate": 3.547086844998168e-05, "loss": 0.4375, "step": 1097 }, { "epoch": 1.086053412462908, "grad_norm": 0.3169100654997534, "learning_rate": 3.5452546720410404e-05, "loss": 0.4386, "step": 1098 }, { "epoch": 1.0870425321463897, "grad_norm": 3.015261219455845, "learning_rate": 3.543422499083914e-05, "loss": 0.6241, "step": 1099 }, { "epoch": 1.0880316518298714, "grad_norm": 0.7258230030251859, "learning_rate": 3.5415903261267864e-05, "loss": 0.4545, "step": 1100 }, { "epoch": 1.089020771513353, "grad_norm": 0.3799505369853889, "learning_rate": 3.53975815316966e-05, "loss": 0.4984, "step": 1101 }, { "epoch": 1.0900098911968348, "grad_norm": 0.4198479739903225, "learning_rate": 3.537925980212532e-05, "loss": 0.4321, "step": 1102 }, { "epoch": 1.0909990108803165, "grad_norm": 0.44474996311910875, "learning_rate": 3.536093807255405e-05, "loss": 0.385, "step": 1103 }, { "epoch": 1.0919881305637982, "grad_norm": 0.34423848805072893, "learning_rate": 3.534261634298278e-05, "loss": 0.3839, "step": 1104 }, { "epoch": 1.0929772502472799, "grad_norm": 0.4385335696893382, "learning_rate": 3.532429461341151e-05, "loss": 0.5029, "step": 1105 }, { "epoch": 1.0939663699307616, "grad_norm": 0.44073911702739826, "learning_rate": 3.5305972883840236e-05, "loss": 0.4517, "step": 1106 }, { "epoch": 1.0949554896142433, "grad_norm": 0.3661390388337049, "learning_rate": 3.528765115426896e-05, "loss": 0.4409, "step": 1107 }, { "epoch": 1.095944609297725, "grad_norm": 0.3778751285178957, "learning_rate": 3.5269329424697696e-05, "loss": 0.4174, "step": 1108 }, { "epoch": 1.0969337289812067, "grad_norm": 0.3484208155588471, "learning_rate": 3.525100769512642e-05, "loss": 0.4595, "step": 1109 }, { "epoch": 1.0979228486646884, "grad_norm": 0.3721575495834725, "learning_rate": 3.523268596555515e-05, "loss": 0.4108, "step": 1110 }, { "epoch": 1.09891196834817, "grad_norm": 0.3260714299366415, "learning_rate": 3.5214364235983875e-05, "loss": 0.4622, "step": 1111 }, { "epoch": 1.0999010880316518, "grad_norm": 0.3354899051869184, "learning_rate": 3.519604250641261e-05, "loss": 0.4113, "step": 1112 }, { "epoch": 1.1008902077151335, "grad_norm": 0.2888023111039527, "learning_rate": 3.5177720776841334e-05, "loss": 0.3783, "step": 1113 }, { "epoch": 1.1018793273986152, "grad_norm": 0.3379822216843918, "learning_rate": 3.515939904727006e-05, "loss": 0.5117, "step": 1114 }, { "epoch": 1.1028684470820969, "grad_norm": 0.3591859612245374, "learning_rate": 3.5141077317698794e-05, "loss": 0.487, "step": 1115 }, { "epoch": 1.1038575667655786, "grad_norm": 0.30210957464850935, "learning_rate": 3.512275558812752e-05, "loss": 0.4574, "step": 1116 }, { "epoch": 1.1048466864490603, "grad_norm": 0.3056806916327237, "learning_rate": 3.5104433858556254e-05, "loss": 0.4382, "step": 1117 }, { "epoch": 1.105835806132542, "grad_norm": 0.3406702445819911, "learning_rate": 3.508611212898498e-05, "loss": 0.4476, "step": 1118 }, { "epoch": 1.1068249258160237, "grad_norm": 0.31733420049096794, "learning_rate": 3.5067790399413706e-05, "loss": 0.454, "step": 1119 }, { "epoch": 1.1078140454995054, "grad_norm": 0.3577310727863552, "learning_rate": 3.504946866984243e-05, "loss": 0.486, "step": 1120 }, { "epoch": 1.108803165182987, "grad_norm": 0.39669890919733924, "learning_rate": 3.5031146940271166e-05, "loss": 0.5032, "step": 1121 }, { "epoch": 1.1097922848664687, "grad_norm": 0.36109756118213504, "learning_rate": 3.501282521069989e-05, "loss": 0.4855, "step": 1122 }, { "epoch": 1.1107814045499504, "grad_norm": 0.41157327887209877, "learning_rate": 3.499450348112862e-05, "loss": 0.492, "step": 1123 }, { "epoch": 1.1117705242334321, "grad_norm": 0.8883017039989023, "learning_rate": 3.4976181751557345e-05, "loss": 0.4755, "step": 1124 }, { "epoch": 1.1127596439169138, "grad_norm": 0.4045680018041389, "learning_rate": 3.495786002198608e-05, "loss": 0.5405, "step": 1125 }, { "epoch": 1.1137487636003955, "grad_norm": 0.46285838105173954, "learning_rate": 3.4939538292414805e-05, "loss": 0.4135, "step": 1126 }, { "epoch": 1.1147378832838775, "grad_norm": 0.4009969222725594, "learning_rate": 3.492121656284353e-05, "loss": 0.4772, "step": 1127 }, { "epoch": 1.115727002967359, "grad_norm": 0.3744825716220794, "learning_rate": 3.4902894833272265e-05, "loss": 0.4132, "step": 1128 }, { "epoch": 1.1167161226508409, "grad_norm": 0.4811370559022165, "learning_rate": 3.488457310370099e-05, "loss": 0.4486, "step": 1129 }, { "epoch": 1.1177052423343223, "grad_norm": 0.32402321215164476, "learning_rate": 3.4866251374129724e-05, "loss": 0.4404, "step": 1130 }, { "epoch": 1.1186943620178043, "grad_norm": 0.35736672487594145, "learning_rate": 3.4847929644558444e-05, "loss": 0.4517, "step": 1131 }, { "epoch": 1.119683481701286, "grad_norm": 0.43284952844544416, "learning_rate": 3.482960791498718e-05, "loss": 0.432, "step": 1132 }, { "epoch": 1.1206726013847677, "grad_norm": 0.33367942093658537, "learning_rate": 3.4811286185415903e-05, "loss": 0.4439, "step": 1133 }, { "epoch": 1.1216617210682494, "grad_norm": 0.34533013092078974, "learning_rate": 3.479296445584464e-05, "loss": 0.4537, "step": 1134 }, { "epoch": 1.122650840751731, "grad_norm": 0.4103596180059782, "learning_rate": 3.477464272627336e-05, "loss": 0.4401, "step": 1135 }, { "epoch": 1.1236399604352127, "grad_norm": 0.3508858030722054, "learning_rate": 3.475632099670209e-05, "loss": 0.4513, "step": 1136 }, { "epoch": 1.1246290801186944, "grad_norm": 0.2644357297408127, "learning_rate": 3.4737999267130816e-05, "loss": 0.4285, "step": 1137 }, { "epoch": 1.1256181998021761, "grad_norm": 0.8993722168956318, "learning_rate": 3.471967753755955e-05, "loss": 0.5024, "step": 1138 }, { "epoch": 1.1266073194856578, "grad_norm": 0.40831707081279583, "learning_rate": 3.4701355807988275e-05, "loss": 0.455, "step": 1139 }, { "epoch": 1.1275964391691395, "grad_norm": 0.3182793210526495, "learning_rate": 3.4683034078417e-05, "loss": 0.4162, "step": 1140 }, { "epoch": 1.1285855588526212, "grad_norm": 0.3821840881244759, "learning_rate": 3.4664712348845735e-05, "loss": 0.4608, "step": 1141 }, { "epoch": 1.129574678536103, "grad_norm": 0.4427568558577197, "learning_rate": 3.464639061927446e-05, "loss": 0.5133, "step": 1142 }, { "epoch": 1.1305637982195846, "grad_norm": 0.9566817035370797, "learning_rate": 3.462806888970319e-05, "loss": 0.4638, "step": 1143 }, { "epoch": 1.1315529179030663, "grad_norm": 0.6038194624696839, "learning_rate": 3.4609747160131914e-05, "loss": 0.3681, "step": 1144 }, { "epoch": 1.132542037586548, "grad_norm": 0.6807613022411125, "learning_rate": 3.459142543056065e-05, "loss": 0.4679, "step": 1145 }, { "epoch": 1.1335311572700297, "grad_norm": 0.4115234393020162, "learning_rate": 3.4573103700989374e-05, "loss": 0.4422, "step": 1146 }, { "epoch": 1.1345202769535114, "grad_norm": 0.4341844759408288, "learning_rate": 3.455478197141811e-05, "loss": 0.4423, "step": 1147 }, { "epoch": 1.1355093966369931, "grad_norm": 0.5051556302844019, "learning_rate": 3.453646024184683e-05, "loss": 0.4541, "step": 1148 }, { "epoch": 1.1364985163204748, "grad_norm": 0.3758242355747177, "learning_rate": 3.451813851227556e-05, "loss": 0.4276, "step": 1149 }, { "epoch": 1.1374876360039565, "grad_norm": 0.3320259649587403, "learning_rate": 3.4499816782704286e-05, "loss": 0.4316, "step": 1150 }, { "epoch": 1.1384767556874382, "grad_norm": 0.45695875442585837, "learning_rate": 3.448149505313302e-05, "loss": 0.4378, "step": 1151 }, { "epoch": 1.13946587537092, "grad_norm": 0.43478750136447, "learning_rate": 3.4463173323561746e-05, "loss": 0.5218, "step": 1152 }, { "epoch": 1.1404549950544016, "grad_norm": 0.8782929825237592, "learning_rate": 3.444485159399047e-05, "loss": 0.4065, "step": 1153 }, { "epoch": 1.1414441147378833, "grad_norm": 0.48626073683964544, "learning_rate": 3.4426529864419206e-05, "loss": 0.4389, "step": 1154 }, { "epoch": 1.142433234421365, "grad_norm": 0.49062321657072966, "learning_rate": 3.440820813484793e-05, "loss": 0.4402, "step": 1155 }, { "epoch": 1.1434223541048467, "grad_norm": 3.9259528695103088, "learning_rate": 3.438988640527666e-05, "loss": 0.5033, "step": 1156 }, { "epoch": 1.1444114737883284, "grad_norm": 0.4908247634768901, "learning_rate": 3.4371564675705385e-05, "loss": 0.4815, "step": 1157 }, { "epoch": 1.1454005934718101, "grad_norm": 0.44304978825609653, "learning_rate": 3.435324294613412e-05, "loss": 0.3978, "step": 1158 }, { "epoch": 1.1463897131552918, "grad_norm": 0.32374507014467085, "learning_rate": 3.4334921216562844e-05, "loss": 0.4137, "step": 1159 }, { "epoch": 1.1473788328387735, "grad_norm": 0.42036545626914656, "learning_rate": 3.431659948699157e-05, "loss": 0.4604, "step": 1160 }, { "epoch": 1.1483679525222552, "grad_norm": 0.4166874208449986, "learning_rate": 3.42982777574203e-05, "loss": 0.4776, "step": 1161 }, { "epoch": 1.149357072205737, "grad_norm": 0.31668752094571023, "learning_rate": 3.427995602784903e-05, "loss": 0.4456, "step": 1162 }, { "epoch": 1.1503461918892186, "grad_norm": 0.43054522427624803, "learning_rate": 3.4261634298277764e-05, "loss": 0.443, "step": 1163 }, { "epoch": 1.1513353115727003, "grad_norm": 0.42615533383169335, "learning_rate": 3.424331256870649e-05, "loss": 0.4032, "step": 1164 }, { "epoch": 1.152324431256182, "grad_norm": 0.3387153691463948, "learning_rate": 3.4224990839135217e-05, "loss": 0.4377, "step": 1165 }, { "epoch": 1.1533135509396637, "grad_norm": 0.36898697794603996, "learning_rate": 3.420666910956394e-05, "loss": 0.3753, "step": 1166 }, { "epoch": 1.1543026706231454, "grad_norm": 0.3421939288510364, "learning_rate": 3.4188347379992676e-05, "loss": 0.4382, "step": 1167 }, { "epoch": 1.155291790306627, "grad_norm": 0.31926334513776805, "learning_rate": 3.41700256504214e-05, "loss": 0.4165, "step": 1168 }, { "epoch": 1.1562809099901088, "grad_norm": 0.40484584927634126, "learning_rate": 3.415170392085013e-05, "loss": 0.4012, "step": 1169 }, { "epoch": 1.1572700296735905, "grad_norm": 0.38104083917952464, "learning_rate": 3.4133382191278855e-05, "loss": 0.4316, "step": 1170 }, { "epoch": 1.1582591493570722, "grad_norm": 0.3709082882174054, "learning_rate": 3.411506046170759e-05, "loss": 0.4243, "step": 1171 }, { "epoch": 1.159248269040554, "grad_norm": 0.38180703630514035, "learning_rate": 3.4096738732136315e-05, "loss": 0.4204, "step": 1172 }, { "epoch": 1.1602373887240356, "grad_norm": 0.35066602597324176, "learning_rate": 3.407841700256504e-05, "loss": 0.4517, "step": 1173 }, { "epoch": 1.1612265084075173, "grad_norm": 0.3409612624272448, "learning_rate": 3.4060095272993775e-05, "loss": 0.4681, "step": 1174 }, { "epoch": 1.162215628090999, "grad_norm": 0.3242800885803111, "learning_rate": 3.40417735434225e-05, "loss": 0.5142, "step": 1175 }, { "epoch": 1.1632047477744807, "grad_norm": 0.3999158161657895, "learning_rate": 3.4023451813851234e-05, "loss": 0.4721, "step": 1176 }, { "epoch": 1.1641938674579624, "grad_norm": 0.3057813262892474, "learning_rate": 3.4005130084279954e-05, "loss": 0.4081, "step": 1177 }, { "epoch": 1.165182987141444, "grad_norm": 0.29265432398856944, "learning_rate": 3.398680835470869e-05, "loss": 0.4741, "step": 1178 }, { "epoch": 1.1661721068249258, "grad_norm": 0.35031812706583215, "learning_rate": 3.3968486625137414e-05, "loss": 0.4069, "step": 1179 }, { "epoch": 1.1671612265084075, "grad_norm": 0.3199265954315745, "learning_rate": 3.395016489556615e-05, "loss": 0.4153, "step": 1180 }, { "epoch": 1.1681503461918892, "grad_norm": 0.3322579671121011, "learning_rate": 3.393184316599487e-05, "loss": 0.4966, "step": 1181 }, { "epoch": 1.1691394658753709, "grad_norm": 0.31674223480365804, "learning_rate": 3.39135214364236e-05, "loss": 0.4019, "step": 1182 }, { "epoch": 1.1701285855588526, "grad_norm": 0.3452256992238975, "learning_rate": 3.3895199706852326e-05, "loss": 0.508, "step": 1183 }, { "epoch": 1.1711177052423343, "grad_norm": 0.3126306353162765, "learning_rate": 3.387687797728106e-05, "loss": 0.4411, "step": 1184 }, { "epoch": 1.172106824925816, "grad_norm": 0.3504430898459003, "learning_rate": 3.3858556247709786e-05, "loss": 0.4409, "step": 1185 }, { "epoch": 1.1730959446092977, "grad_norm": 0.32158432591053415, "learning_rate": 3.384023451813851e-05, "loss": 0.4216, "step": 1186 }, { "epoch": 1.1740850642927794, "grad_norm": 0.33575292344710606, "learning_rate": 3.3821912788567245e-05, "loss": 0.4653, "step": 1187 }, { "epoch": 1.175074183976261, "grad_norm": 0.2953528980384244, "learning_rate": 3.380359105899597e-05, "loss": 0.49, "step": 1188 }, { "epoch": 1.1760633036597428, "grad_norm": 0.44904372451288704, "learning_rate": 3.37852693294247e-05, "loss": 0.49, "step": 1189 }, { "epoch": 1.1770524233432245, "grad_norm": 0.3985352253667881, "learning_rate": 3.3766947599853424e-05, "loss": 0.4351, "step": 1190 }, { "epoch": 1.1780415430267062, "grad_norm": 0.36529013646478736, "learning_rate": 3.374862587028216e-05, "loss": 0.3994, "step": 1191 }, { "epoch": 1.1790306627101879, "grad_norm": 0.4181274052477301, "learning_rate": 3.3730304140710884e-05, "loss": 0.461, "step": 1192 }, { "epoch": 1.1800197823936696, "grad_norm": 0.37986297109107886, "learning_rate": 3.371198241113962e-05, "loss": 0.4425, "step": 1193 }, { "epoch": 1.1810089020771513, "grad_norm": 0.38662532285644546, "learning_rate": 3.369366068156834e-05, "loss": 0.4557, "step": 1194 }, { "epoch": 1.181998021760633, "grad_norm": 0.320518534529602, "learning_rate": 3.367533895199707e-05, "loss": 0.387, "step": 1195 }, { "epoch": 1.1829871414441147, "grad_norm": 0.3794192278894525, "learning_rate": 3.3657017222425796e-05, "loss": 0.4355, "step": 1196 }, { "epoch": 1.1839762611275964, "grad_norm": 0.3258255894383896, "learning_rate": 3.363869549285453e-05, "loss": 0.4617, "step": 1197 }, { "epoch": 1.184965380811078, "grad_norm": 0.3753454976012276, "learning_rate": 3.3620373763283256e-05, "loss": 0.4796, "step": 1198 }, { "epoch": 1.1859545004945597, "grad_norm": 0.31066105843247516, "learning_rate": 3.360205203371198e-05, "loss": 0.4841, "step": 1199 }, { "epoch": 1.1869436201780414, "grad_norm": 0.37065175137983325, "learning_rate": 3.3583730304140716e-05, "loss": 0.4436, "step": 1200 }, { "epoch": 1.1879327398615231, "grad_norm": 0.38153937820459927, "learning_rate": 3.356540857456944e-05, "loss": 0.4449, "step": 1201 }, { "epoch": 1.188921859545005, "grad_norm": 0.31499452837981, "learning_rate": 3.354708684499817e-05, "loss": 0.3774, "step": 1202 }, { "epoch": 1.1899109792284865, "grad_norm": 0.36683386599623874, "learning_rate": 3.3528765115426895e-05, "loss": 0.4489, "step": 1203 }, { "epoch": 1.1909000989119685, "grad_norm": 0.3929092276173119, "learning_rate": 3.351044338585563e-05, "loss": 0.4983, "step": 1204 }, { "epoch": 1.19188921859545, "grad_norm": 0.33361040009672116, "learning_rate": 3.3492121656284355e-05, "loss": 0.404, "step": 1205 }, { "epoch": 1.1928783382789319, "grad_norm": 0.32846256380043704, "learning_rate": 3.347379992671308e-05, "loss": 0.4868, "step": 1206 }, { "epoch": 1.1938674579624133, "grad_norm": 0.3340382987952063, "learning_rate": 3.345547819714181e-05, "loss": 0.4268, "step": 1207 }, { "epoch": 1.1948565776458953, "grad_norm": 0.37162462421580916, "learning_rate": 3.343715646757054e-05, "loss": 0.4391, "step": 1208 }, { "epoch": 1.1958456973293767, "grad_norm": 0.3365741665506994, "learning_rate": 3.341883473799927e-05, "loss": 0.4853, "step": 1209 }, { "epoch": 1.1968348170128587, "grad_norm": 0.38473036798920535, "learning_rate": 3.3400513008428e-05, "loss": 0.4402, "step": 1210 }, { "epoch": 1.1978239366963404, "grad_norm": 0.2943755821058674, "learning_rate": 3.338219127885673e-05, "loss": 0.4238, "step": 1211 }, { "epoch": 1.198813056379822, "grad_norm": 0.3355562973386804, "learning_rate": 3.336386954928545e-05, "loss": 0.3917, "step": 1212 }, { "epoch": 1.1998021760633037, "grad_norm": 0.317119955078073, "learning_rate": 3.3345547819714186e-05, "loss": 0.4691, "step": 1213 }, { "epoch": 1.2007912957467854, "grad_norm": 0.33025445538947157, "learning_rate": 3.332722609014291e-05, "loss": 0.5038, "step": 1214 }, { "epoch": 1.2017804154302671, "grad_norm": 0.3340396874499203, "learning_rate": 3.330890436057164e-05, "loss": 0.4274, "step": 1215 }, { "epoch": 1.2027695351137488, "grad_norm": 0.34655165596355814, "learning_rate": 3.3290582631000365e-05, "loss": 0.4066, "step": 1216 }, { "epoch": 1.2037586547972305, "grad_norm": 0.6174225922607776, "learning_rate": 3.32722609014291e-05, "loss": 0.4073, "step": 1217 }, { "epoch": 1.2047477744807122, "grad_norm": 0.3327762274517774, "learning_rate": 3.3253939171857825e-05, "loss": 0.446, "step": 1218 }, { "epoch": 1.205736894164194, "grad_norm": 0.32324569403115144, "learning_rate": 3.323561744228655e-05, "loss": 0.3973, "step": 1219 }, { "epoch": 1.2067260138476756, "grad_norm": 0.3183062423357114, "learning_rate": 3.321729571271528e-05, "loss": 0.3924, "step": 1220 }, { "epoch": 1.2077151335311573, "grad_norm": 0.3100295021920569, "learning_rate": 3.319897398314401e-05, "loss": 0.4126, "step": 1221 }, { "epoch": 1.208704253214639, "grad_norm": 0.3373361154076598, "learning_rate": 3.3180652253572744e-05, "loss": 0.4281, "step": 1222 }, { "epoch": 1.2096933728981207, "grad_norm": 0.3765180938062049, "learning_rate": 3.3162330524001464e-05, "loss": 0.5078, "step": 1223 }, { "epoch": 1.2106824925816024, "grad_norm": 0.3143717459812389, "learning_rate": 3.31440087944302e-05, "loss": 0.4406, "step": 1224 }, { "epoch": 1.2116716122650841, "grad_norm": 0.43796853097237903, "learning_rate": 3.3125687064858924e-05, "loss": 0.4843, "step": 1225 }, { "epoch": 1.2126607319485658, "grad_norm": 0.27756885414505256, "learning_rate": 3.310736533528766e-05, "loss": 0.3495, "step": 1226 }, { "epoch": 1.2136498516320475, "grad_norm": 0.34846044043895186, "learning_rate": 3.308904360571638e-05, "loss": 0.4714, "step": 1227 }, { "epoch": 1.2146389713155292, "grad_norm": 0.2792141653842612, "learning_rate": 3.307072187614511e-05, "loss": 0.435, "step": 1228 }, { "epoch": 1.215628090999011, "grad_norm": 0.3247383857479883, "learning_rate": 3.3052400146573836e-05, "loss": 0.4369, "step": 1229 }, { "epoch": 1.2166172106824926, "grad_norm": 0.31934633509474236, "learning_rate": 3.303407841700257e-05, "loss": 0.4247, "step": 1230 }, { "epoch": 1.2176063303659743, "grad_norm": 0.3117420006063186, "learning_rate": 3.3015756687431296e-05, "loss": 0.4221, "step": 1231 }, { "epoch": 1.218595450049456, "grad_norm": 0.37143959870180393, "learning_rate": 3.299743495786002e-05, "loss": 0.4451, "step": 1232 }, { "epoch": 1.2195845697329377, "grad_norm": 0.2747122535928208, "learning_rate": 3.297911322828875e-05, "loss": 0.4168, "step": 1233 }, { "epoch": 1.2205736894164194, "grad_norm": 0.32821875211096874, "learning_rate": 3.296079149871748e-05, "loss": 0.4876, "step": 1234 }, { "epoch": 1.2215628090999011, "grad_norm": 0.2848558407621968, "learning_rate": 3.294246976914621e-05, "loss": 0.4402, "step": 1235 }, { "epoch": 1.2225519287833828, "grad_norm": 0.2885008544507463, "learning_rate": 3.2924148039574935e-05, "loss": 0.4349, "step": 1236 }, { "epoch": 1.2235410484668645, "grad_norm": 0.33320909099352525, "learning_rate": 3.290582631000367e-05, "loss": 0.4433, "step": 1237 }, { "epoch": 1.2245301681503462, "grad_norm": 0.25857871277266625, "learning_rate": 3.2887504580432394e-05, "loss": 0.4339, "step": 1238 }, { "epoch": 1.225519287833828, "grad_norm": 0.32250835248537585, "learning_rate": 3.286918285086113e-05, "loss": 0.4378, "step": 1239 }, { "epoch": 1.2265084075173096, "grad_norm": 0.30737128663542473, "learning_rate": 3.285086112128985e-05, "loss": 0.4577, "step": 1240 }, { "epoch": 1.2274975272007913, "grad_norm": 0.2742545175524849, "learning_rate": 3.283253939171858e-05, "loss": 0.428, "step": 1241 }, { "epoch": 1.228486646884273, "grad_norm": 0.28951560409808336, "learning_rate": 3.2814217662147307e-05, "loss": 0.3863, "step": 1242 }, { "epoch": 1.2294757665677547, "grad_norm": 0.7199122350538452, "learning_rate": 3.279589593257604e-05, "loss": 0.4365, "step": 1243 }, { "epoch": 1.2304648862512364, "grad_norm": 0.25600755117681134, "learning_rate": 3.277757420300476e-05, "loss": 0.4074, "step": 1244 }, { "epoch": 1.231454005934718, "grad_norm": 0.8811383887062425, "learning_rate": 3.275925247343349e-05, "loss": 0.481, "step": 1245 }, { "epoch": 1.2324431256181998, "grad_norm": 0.3023345814779684, "learning_rate": 3.2740930743862226e-05, "loss": 0.4768, "step": 1246 }, { "epoch": 1.2334322453016815, "grad_norm": 0.2755925407169624, "learning_rate": 3.272260901429095e-05, "loss": 0.4027, "step": 1247 }, { "epoch": 1.2344213649851632, "grad_norm": 11.463153475362521, "learning_rate": 3.270428728471968e-05, "loss": 0.4636, "step": 1248 }, { "epoch": 1.235410484668645, "grad_norm": 0.45187999656619815, "learning_rate": 3.2685965555148405e-05, "loss": 0.4927, "step": 1249 }, { "epoch": 1.2363996043521266, "grad_norm": 0.32892957675978074, "learning_rate": 3.266764382557714e-05, "loss": 0.4548, "step": 1250 }, { "epoch": 1.2373887240356083, "grad_norm": 0.3303929130848598, "learning_rate": 3.2649322096005865e-05, "loss": 0.4709, "step": 1251 }, { "epoch": 1.23837784371909, "grad_norm": 0.3373802532562506, "learning_rate": 3.263100036643459e-05, "loss": 0.4582, "step": 1252 }, { "epoch": 1.2393669634025717, "grad_norm": 0.3779420909570119, "learning_rate": 3.261267863686332e-05, "loss": 0.4451, "step": 1253 }, { "epoch": 1.2403560830860534, "grad_norm": 0.291216947790094, "learning_rate": 3.259435690729205e-05, "loss": 0.4013, "step": 1254 }, { "epoch": 1.241345202769535, "grad_norm": 0.34985117379943337, "learning_rate": 3.257603517772078e-05, "loss": 0.4513, "step": 1255 }, { "epoch": 1.2423343224530168, "grad_norm": 0.31095271457516216, "learning_rate": 3.255771344814951e-05, "loss": 0.4411, "step": 1256 }, { "epoch": 1.2433234421364985, "grad_norm": 1.047735080448275, "learning_rate": 3.253939171857823e-05, "loss": 0.5013, "step": 1257 }, { "epoch": 1.2443125618199802, "grad_norm": 0.27825085687486434, "learning_rate": 3.252106998900696e-05, "loss": 0.4161, "step": 1258 }, { "epoch": 1.2453016815034619, "grad_norm": 0.3636283861995447, "learning_rate": 3.2502748259435696e-05, "loss": 0.5048, "step": 1259 }, { "epoch": 1.2462908011869436, "grad_norm": 0.2972461553564986, "learning_rate": 3.248442652986442e-05, "loss": 0.4577, "step": 1260 }, { "epoch": 1.2472799208704253, "grad_norm": 0.4663431964179042, "learning_rate": 3.246610480029315e-05, "loss": 0.4355, "step": 1261 }, { "epoch": 1.248269040553907, "grad_norm": 0.7010092875578634, "learning_rate": 3.2447783070721876e-05, "loss": 0.4501, "step": 1262 }, { "epoch": 1.2492581602373887, "grad_norm": 0.301261398514134, "learning_rate": 3.242946134115061e-05, "loss": 0.4805, "step": 1263 }, { "epoch": 1.2502472799208704, "grad_norm": 0.3418549326196357, "learning_rate": 3.2411139611579335e-05, "loss": 0.4795, "step": 1264 }, { "epoch": 1.251236399604352, "grad_norm": 0.29526440353783767, "learning_rate": 3.239281788200806e-05, "loss": 0.4332, "step": 1265 }, { "epoch": 1.2522255192878338, "grad_norm": 0.2967530321345406, "learning_rate": 3.237449615243679e-05, "loss": 0.3992, "step": 1266 }, { "epoch": 1.2532146389713155, "grad_norm": 0.30327817973420274, "learning_rate": 3.235617442286552e-05, "loss": 0.4832, "step": 1267 }, { "epoch": 1.2542037586547972, "grad_norm": 0.28390309436771854, "learning_rate": 3.233785269329425e-05, "loss": 0.394, "step": 1268 }, { "epoch": 1.2551928783382789, "grad_norm": 0.3640976735315917, "learning_rate": 3.2319530963722974e-05, "loss": 0.4319, "step": 1269 }, { "epoch": 1.2561819980217606, "grad_norm": 0.3039414854494338, "learning_rate": 3.230120923415171e-05, "loss": 0.4777, "step": 1270 }, { "epoch": 1.2571711177052423, "grad_norm": 0.2797669473882524, "learning_rate": 3.2282887504580434e-05, "loss": 0.4259, "step": 1271 }, { "epoch": 1.258160237388724, "grad_norm": 0.3431193474424225, "learning_rate": 3.226456577500917e-05, "loss": 0.4484, "step": 1272 }, { "epoch": 1.2591493570722057, "grad_norm": 0.2748018185129936, "learning_rate": 3.224624404543789e-05, "loss": 0.4019, "step": 1273 }, { "epoch": 1.2601384767556874, "grad_norm": 0.3078022305195795, "learning_rate": 3.222792231586662e-05, "loss": 0.4357, "step": 1274 }, { "epoch": 1.2611275964391693, "grad_norm": 0.28817276925435575, "learning_rate": 3.2209600586295346e-05, "loss": 0.4581, "step": 1275 }, { "epoch": 1.2621167161226508, "grad_norm": 0.28915115293238075, "learning_rate": 3.219127885672408e-05, "loss": 0.4113, "step": 1276 }, { "epoch": 1.2631058358061327, "grad_norm": 0.29431101862681536, "learning_rate": 3.2172957127152806e-05, "loss": 0.4844, "step": 1277 }, { "epoch": 1.2640949554896141, "grad_norm": 0.31417644441909515, "learning_rate": 3.215463539758153e-05, "loss": 0.4359, "step": 1278 }, { "epoch": 1.265084075173096, "grad_norm": 0.2932996151523536, "learning_rate": 3.213631366801026e-05, "loss": 0.4566, "step": 1279 }, { "epoch": 1.2660731948565775, "grad_norm": 0.3544874502119, "learning_rate": 3.211799193843899e-05, "loss": 0.4528, "step": 1280 }, { "epoch": 1.2670623145400595, "grad_norm": 0.31431737172579327, "learning_rate": 3.209967020886772e-05, "loss": 0.4873, "step": 1281 }, { "epoch": 1.268051434223541, "grad_norm": 0.32083957972618893, "learning_rate": 3.2081348479296445e-05, "loss": 0.4405, "step": 1282 }, { "epoch": 1.2690405539070229, "grad_norm": 0.28531911086752, "learning_rate": 3.206302674972518e-05, "loss": 0.4763, "step": 1283 }, { "epoch": 1.2700296735905043, "grad_norm": 0.39038530630890933, "learning_rate": 3.2044705020153904e-05, "loss": 0.487, "step": 1284 }, { "epoch": 1.2710187932739863, "grad_norm": 0.3231694114187269, "learning_rate": 3.202638329058264e-05, "loss": 0.4575, "step": 1285 }, { "epoch": 1.2720079129574677, "grad_norm": 0.39483284030242527, "learning_rate": 3.200806156101136e-05, "loss": 0.5078, "step": 1286 }, { "epoch": 1.2729970326409497, "grad_norm": 0.2624573775627927, "learning_rate": 3.198973983144009e-05, "loss": 0.4189, "step": 1287 }, { "epoch": 1.2739861523244311, "grad_norm": 0.3539064045837323, "learning_rate": 3.197141810186882e-05, "loss": 0.4083, "step": 1288 }, { "epoch": 1.274975272007913, "grad_norm": 0.41113892601872426, "learning_rate": 3.195309637229755e-05, "loss": 0.4359, "step": 1289 }, { "epoch": 1.2759643916913945, "grad_norm": 0.3263835404973625, "learning_rate": 3.193477464272627e-05, "loss": 0.4466, "step": 1290 }, { "epoch": 1.2769535113748764, "grad_norm": 0.3521424288990003, "learning_rate": 3.1916452913155e-05, "loss": 0.4346, "step": 1291 }, { "epoch": 1.277942631058358, "grad_norm": 0.32960482803892793, "learning_rate": 3.189813118358373e-05, "loss": 0.4081, "step": 1292 }, { "epoch": 1.2789317507418398, "grad_norm": 0.37621318875682574, "learning_rate": 3.187980945401246e-05, "loss": 0.5071, "step": 1293 }, { "epoch": 1.2799208704253215, "grad_norm": 0.2842212763877933, "learning_rate": 3.186148772444119e-05, "loss": 0.4383, "step": 1294 }, { "epoch": 1.2809099901088032, "grad_norm": 0.4185836752845537, "learning_rate": 3.1843165994869915e-05, "loss": 0.4605, "step": 1295 }, { "epoch": 1.281899109792285, "grad_norm": 0.2864590173818887, "learning_rate": 3.182484426529865e-05, "loss": 0.4115, "step": 1296 }, { "epoch": 1.2828882294757666, "grad_norm": 0.31251818275646565, "learning_rate": 3.1806522535727375e-05, "loss": 0.4749, "step": 1297 }, { "epoch": 1.2838773491592483, "grad_norm": 0.40375308662312326, "learning_rate": 3.17882008061561e-05, "loss": 0.4805, "step": 1298 }, { "epoch": 1.28486646884273, "grad_norm": 0.2787733861619942, "learning_rate": 3.176987907658483e-05, "loss": 0.4695, "step": 1299 }, { "epoch": 1.2858555885262117, "grad_norm": 0.3390155103152818, "learning_rate": 3.175155734701356e-05, "loss": 0.4799, "step": 1300 }, { "epoch": 1.2868447082096934, "grad_norm": 0.39915923407897047, "learning_rate": 3.173323561744229e-05, "loss": 0.4911, "step": 1301 }, { "epoch": 1.2878338278931751, "grad_norm": 0.2829054197112506, "learning_rate": 3.171491388787102e-05, "loss": 0.3704, "step": 1302 }, { "epoch": 1.2888229475766568, "grad_norm": 0.3354351078071799, "learning_rate": 3.169659215829974e-05, "loss": 0.4108, "step": 1303 }, { "epoch": 1.2898120672601385, "grad_norm": 0.2977538229373433, "learning_rate": 3.167827042872847e-05, "loss": 0.4279, "step": 1304 }, { "epoch": 1.2908011869436202, "grad_norm": 0.328567314688651, "learning_rate": 3.16599486991572e-05, "loss": 0.4056, "step": 1305 }, { "epoch": 1.291790306627102, "grad_norm": 0.3262674751990915, "learning_rate": 3.164162696958593e-05, "loss": 0.4404, "step": 1306 }, { "epoch": 1.2927794263105836, "grad_norm": 0.38924960367345507, "learning_rate": 3.162330524001466e-05, "loss": 0.4364, "step": 1307 }, { "epoch": 1.2937685459940653, "grad_norm": 0.28787499890259116, "learning_rate": 3.1604983510443386e-05, "loss": 0.5439, "step": 1308 }, { "epoch": 1.294757665677547, "grad_norm": 0.3356946307141818, "learning_rate": 3.158666178087212e-05, "loss": 0.4324, "step": 1309 }, { "epoch": 1.2957467853610287, "grad_norm": 0.4879317138260975, "learning_rate": 3.1568340051300845e-05, "loss": 0.4807, "step": 1310 }, { "epoch": 1.2967359050445104, "grad_norm": 0.2955688732632104, "learning_rate": 3.155001832172957e-05, "loss": 0.4036, "step": 1311 }, { "epoch": 1.2977250247279921, "grad_norm": 0.4032945847602782, "learning_rate": 3.15316965921583e-05, "loss": 0.4401, "step": 1312 }, { "epoch": 1.2987141444114738, "grad_norm": 0.3717861992107911, "learning_rate": 3.151337486258703e-05, "loss": 0.412, "step": 1313 }, { "epoch": 1.2997032640949555, "grad_norm": 0.36230212240482274, "learning_rate": 3.149505313301576e-05, "loss": 0.4756, "step": 1314 }, { "epoch": 1.3006923837784372, "grad_norm": 0.3219831393538455, "learning_rate": 3.1476731403444484e-05, "loss": 0.4307, "step": 1315 }, { "epoch": 1.301681503461919, "grad_norm": 0.2980114239679098, "learning_rate": 3.145840967387321e-05, "loss": 0.3987, "step": 1316 }, { "epoch": 1.3026706231454006, "grad_norm": 0.8151806056234506, "learning_rate": 3.1440087944301944e-05, "loss": 0.4737, "step": 1317 }, { "epoch": 1.3036597428288823, "grad_norm": 0.4605566888952793, "learning_rate": 3.142176621473068e-05, "loss": 0.4641, "step": 1318 }, { "epoch": 1.304648862512364, "grad_norm": 0.3980859249064098, "learning_rate": 3.14034444851594e-05, "loss": 0.459, "step": 1319 }, { "epoch": 1.3056379821958457, "grad_norm": 0.2841614140383684, "learning_rate": 3.138512275558813e-05, "loss": 0.4057, "step": 1320 }, { "epoch": 1.3066271018793274, "grad_norm": 1.9073972336742402, "learning_rate": 3.1366801026016856e-05, "loss": 0.4767, "step": 1321 }, { "epoch": 1.307616221562809, "grad_norm": 0.5592709384548382, "learning_rate": 3.134847929644559e-05, "loss": 0.4819, "step": 1322 }, { "epoch": 1.3086053412462908, "grad_norm": 0.3222748433424541, "learning_rate": 3.1330157566874316e-05, "loss": 0.4371, "step": 1323 }, { "epoch": 1.3095944609297725, "grad_norm": 0.4650918381065197, "learning_rate": 3.131183583730304e-05, "loss": 0.415, "step": 1324 }, { "epoch": 1.3105835806132542, "grad_norm": 0.4224117379256276, "learning_rate": 3.129351410773177e-05, "loss": 0.4566, "step": 1325 }, { "epoch": 1.311572700296736, "grad_norm": 0.46207236337386226, "learning_rate": 3.12751923781605e-05, "loss": 0.5005, "step": 1326 }, { "epoch": 1.3125618199802176, "grad_norm": 0.4683411564216293, "learning_rate": 3.125687064858923e-05, "loss": 0.4825, "step": 1327 }, { "epoch": 1.3135509396636993, "grad_norm": 0.40426282504932587, "learning_rate": 3.1238548919017955e-05, "loss": 0.4031, "step": 1328 }, { "epoch": 1.314540059347181, "grad_norm": 0.4937917446065795, "learning_rate": 3.122022718944669e-05, "loss": 0.4375, "step": 1329 }, { "epoch": 1.3155291790306627, "grad_norm": 0.44951168903509353, "learning_rate": 3.1201905459875414e-05, "loss": 0.4931, "step": 1330 }, { "epoch": 1.3165182987141444, "grad_norm": 0.391635440933602, "learning_rate": 3.118358373030415e-05, "loss": 0.3879, "step": 1331 }, { "epoch": 1.317507418397626, "grad_norm": 0.5580933846109488, "learning_rate": 3.116526200073287e-05, "loss": 0.4064, "step": 1332 }, { "epoch": 1.3184965380811078, "grad_norm": 0.26495237831546364, "learning_rate": 3.11469402711616e-05, "loss": 0.3427, "step": 1333 }, { "epoch": 1.3194856577645895, "grad_norm": 0.4754556883385079, "learning_rate": 3.112861854159033e-05, "loss": 0.4223, "step": 1334 }, { "epoch": 1.3204747774480712, "grad_norm": 0.3881819412366177, "learning_rate": 3.111029681201906e-05, "loss": 0.4623, "step": 1335 }, { "epoch": 1.3214638971315529, "grad_norm": 0.27166678881531925, "learning_rate": 3.109197508244778e-05, "loss": 0.4131, "step": 1336 }, { "epoch": 1.3224530168150346, "grad_norm": 0.40190887482300114, "learning_rate": 3.107365335287651e-05, "loss": 0.492, "step": 1337 }, { "epoch": 1.3234421364985163, "grad_norm": 0.3539039447670423, "learning_rate": 3.105533162330524e-05, "loss": 0.4205, "step": 1338 }, { "epoch": 1.324431256181998, "grad_norm": 0.35944898679004156, "learning_rate": 3.103700989373397e-05, "loss": 0.4791, "step": 1339 }, { "epoch": 1.3254203758654797, "grad_norm": 0.405897222512928, "learning_rate": 3.10186881641627e-05, "loss": 0.5133, "step": 1340 }, { "epoch": 1.3264094955489614, "grad_norm": 0.3149489860402367, "learning_rate": 3.1000366434591425e-05, "loss": 0.4603, "step": 1341 }, { "epoch": 1.327398615232443, "grad_norm": 0.45252664436277323, "learning_rate": 3.098204470502016e-05, "loss": 0.5322, "step": 1342 }, { "epoch": 1.3283877349159248, "grad_norm": 0.2880720148908769, "learning_rate": 3.0963722975448885e-05, "loss": 0.4097, "step": 1343 }, { "epoch": 1.3293768545994065, "grad_norm": 0.43858458830839114, "learning_rate": 3.094540124587761e-05, "loss": 0.4331, "step": 1344 }, { "epoch": 1.3303659742828882, "grad_norm": 0.2965447563557795, "learning_rate": 3.092707951630634e-05, "loss": 0.4524, "step": 1345 }, { "epoch": 1.3313550939663699, "grad_norm": 0.3131743992651659, "learning_rate": 3.090875778673507e-05, "loss": 0.4266, "step": 1346 }, { "epoch": 1.3323442136498516, "grad_norm": 0.35252494183868, "learning_rate": 3.08904360571638e-05, "loss": 0.4365, "step": 1347 }, { "epoch": 1.3333333333333333, "grad_norm": 0.3138151302405501, "learning_rate": 3.087211432759253e-05, "loss": 0.4451, "step": 1348 }, { "epoch": 1.334322453016815, "grad_norm": 0.3115085002074383, "learning_rate": 3.085379259802125e-05, "loss": 0.4425, "step": 1349 }, { "epoch": 1.3353115727002967, "grad_norm": 0.30006114039678367, "learning_rate": 3.083547086844998e-05, "loss": 0.3935, "step": 1350 }, { "epoch": 1.3363006923837784, "grad_norm": 0.28834125078240946, "learning_rate": 3.081714913887871e-05, "loss": 0.413, "step": 1351 }, { "epoch": 1.3372898120672603, "grad_norm": 0.3561705460960808, "learning_rate": 3.079882740930744e-05, "loss": 0.4968, "step": 1352 }, { "epoch": 1.3382789317507418, "grad_norm": 0.3260716628155875, "learning_rate": 3.078050567973617e-05, "loss": 0.4487, "step": 1353 }, { "epoch": 1.3392680514342237, "grad_norm": 0.32982555741060104, "learning_rate": 3.0762183950164896e-05, "loss": 0.441, "step": 1354 }, { "epoch": 1.3402571711177051, "grad_norm": 0.30807762347144296, "learning_rate": 3.074386222059363e-05, "loss": 0.4121, "step": 1355 }, { "epoch": 1.341246290801187, "grad_norm": 0.3532020602660648, "learning_rate": 3.0725540491022355e-05, "loss": 0.505, "step": 1356 }, { "epoch": 1.3422354104846685, "grad_norm": 0.2876611640436799, "learning_rate": 3.070721876145108e-05, "loss": 0.4096, "step": 1357 }, { "epoch": 1.3432245301681505, "grad_norm": 0.3186642457128235, "learning_rate": 3.068889703187981e-05, "loss": 0.4585, "step": 1358 }, { "epoch": 1.344213649851632, "grad_norm": 0.31673282289780236, "learning_rate": 3.067057530230854e-05, "loss": 0.4235, "step": 1359 }, { "epoch": 1.3452027695351139, "grad_norm": 0.35253205951974537, "learning_rate": 3.065225357273727e-05, "loss": 0.4569, "step": 1360 }, { "epoch": 1.3461918892185953, "grad_norm": 0.3349663982529888, "learning_rate": 3.0633931843165994e-05, "loss": 0.3936, "step": 1361 }, { "epoch": 1.3471810089020773, "grad_norm": 0.33498838963025845, "learning_rate": 3.061561011359472e-05, "loss": 0.4983, "step": 1362 }, { "epoch": 1.3481701285855587, "grad_norm": 0.3400374587506467, "learning_rate": 3.0597288384023454e-05, "loss": 0.4503, "step": 1363 }, { "epoch": 1.3491592482690407, "grad_norm": 0.3203012395834267, "learning_rate": 3.057896665445218e-05, "loss": 0.3944, "step": 1364 }, { "epoch": 1.3501483679525221, "grad_norm": 0.3920156373833764, "learning_rate": 3.0560644924880913e-05, "loss": 0.4813, "step": 1365 }, { "epoch": 1.351137487636004, "grad_norm": 0.30315401884490906, "learning_rate": 3.054232319530964e-05, "loss": 0.4454, "step": 1366 }, { "epoch": 1.3521266073194855, "grad_norm": 0.35739212453761815, "learning_rate": 3.0524001465738366e-05, "loss": 0.439, "step": 1367 }, { "epoch": 1.3531157270029674, "grad_norm": 0.3417886435859495, "learning_rate": 3.0505679736167096e-05, "loss": 0.4665, "step": 1368 }, { "epoch": 1.354104846686449, "grad_norm": 0.3235348071287897, "learning_rate": 3.0487358006595822e-05, "loss": 0.4226, "step": 1369 }, { "epoch": 1.3550939663699308, "grad_norm": 0.3159309730114347, "learning_rate": 3.0469036277024556e-05, "loss": 0.4466, "step": 1370 }, { "epoch": 1.3560830860534125, "grad_norm": 0.324193279571812, "learning_rate": 3.045071454745328e-05, "loss": 0.4501, "step": 1371 }, { "epoch": 1.3570722057368942, "grad_norm": 0.3071795678407246, "learning_rate": 3.0432392817882012e-05, "loss": 0.4487, "step": 1372 }, { "epoch": 1.358061325420376, "grad_norm": 0.30058020510609845, "learning_rate": 3.0414071088310735e-05, "loss": 0.4561, "step": 1373 }, { "epoch": 1.3590504451038576, "grad_norm": 0.3505053014597225, "learning_rate": 3.0395749358739468e-05, "loss": 0.4025, "step": 1374 }, { "epoch": 1.3600395647873393, "grad_norm": 0.3111111301895231, "learning_rate": 3.037742762916819e-05, "loss": 0.4098, "step": 1375 }, { "epoch": 1.361028684470821, "grad_norm": 0.2601225263616035, "learning_rate": 3.0359105899596924e-05, "loss": 0.4243, "step": 1376 }, { "epoch": 1.3620178041543027, "grad_norm": 0.316696497556371, "learning_rate": 3.0340784170025654e-05, "loss": 0.3969, "step": 1377 }, { "epoch": 1.3630069238377844, "grad_norm": 0.34847389358503794, "learning_rate": 3.032246244045438e-05, "loss": 0.4551, "step": 1378 }, { "epoch": 1.3639960435212661, "grad_norm": 0.2902604562577088, "learning_rate": 3.030414071088311e-05, "loss": 0.407, "step": 1379 }, { "epoch": 1.3649851632047478, "grad_norm": 0.27781817505418216, "learning_rate": 3.0285818981311837e-05, "loss": 0.4328, "step": 1380 }, { "epoch": 1.3659742828882295, "grad_norm": 5.942774002381965, "learning_rate": 3.0267497251740567e-05, "loss": 0.4695, "step": 1381 }, { "epoch": 1.3669634025717112, "grad_norm": 0.38926658156561217, "learning_rate": 3.0249175522169293e-05, "loss": 0.4174, "step": 1382 }, { "epoch": 1.367952522255193, "grad_norm": 0.4415126867453672, "learning_rate": 3.0230853792598023e-05, "loss": 0.4475, "step": 1383 }, { "epoch": 1.3689416419386746, "grad_norm": 0.3120038844745485, "learning_rate": 3.021253206302675e-05, "loss": 0.469, "step": 1384 }, { "epoch": 1.3699307616221563, "grad_norm": 0.41969819981367795, "learning_rate": 3.019421033345548e-05, "loss": 0.4923, "step": 1385 }, { "epoch": 1.370919881305638, "grad_norm": 0.38256379980543986, "learning_rate": 3.0175888603884205e-05, "loss": 0.4336, "step": 1386 }, { "epoch": 1.3719090009891197, "grad_norm": 0.3767270435686749, "learning_rate": 3.0157566874312935e-05, "loss": 0.4539, "step": 1387 }, { "epoch": 1.3728981206726014, "grad_norm": 0.3157034106065137, "learning_rate": 3.013924514474166e-05, "loss": 0.445, "step": 1388 }, { "epoch": 1.3738872403560831, "grad_norm": 0.36045163968750676, "learning_rate": 3.0120923415170395e-05, "loss": 0.4121, "step": 1389 }, { "epoch": 1.3748763600395648, "grad_norm": 0.31479023453240734, "learning_rate": 3.0102601685599125e-05, "loss": 0.4216, "step": 1390 }, { "epoch": 1.3758654797230465, "grad_norm": 0.2997878080503291, "learning_rate": 3.008427995602785e-05, "loss": 0.3644, "step": 1391 }, { "epoch": 1.3768545994065282, "grad_norm": 0.3320290130778573, "learning_rate": 3.006595822645658e-05, "loss": 0.4513, "step": 1392 }, { "epoch": 1.37784371909001, "grad_norm": 0.29477176122877125, "learning_rate": 3.0047636496885307e-05, "loss": 0.4601, "step": 1393 }, { "epoch": 1.3788328387734916, "grad_norm": 0.3412218077170071, "learning_rate": 3.0029314767314037e-05, "loss": 0.4154, "step": 1394 }, { "epoch": 1.3798219584569733, "grad_norm": 0.3275380780138223, "learning_rate": 3.0010993037742764e-05, "loss": 0.4176, "step": 1395 }, { "epoch": 1.380811078140455, "grad_norm": 0.2837253095050283, "learning_rate": 2.9992671308171493e-05, "loss": 0.4187, "step": 1396 }, { "epoch": 1.3818001978239367, "grad_norm": 0.4176886469982826, "learning_rate": 2.997434957860022e-05, "loss": 0.4724, "step": 1397 }, { "epoch": 1.3827893175074184, "grad_norm": 0.31073598352099785, "learning_rate": 2.995602784902895e-05, "loss": 0.4342, "step": 1398 }, { "epoch": 1.3837784371909, "grad_norm": 0.3543815361213989, "learning_rate": 2.9937706119457676e-05, "loss": 0.5104, "step": 1399 }, { "epoch": 1.3847675568743818, "grad_norm": 0.33664475436986274, "learning_rate": 2.9919384389886406e-05, "loss": 0.4302, "step": 1400 }, { "epoch": 1.3857566765578635, "grad_norm": 0.36124601508500437, "learning_rate": 2.990106266031514e-05, "loss": 0.4512, "step": 1401 }, { "epoch": 1.3867457962413452, "grad_norm": 0.35574503741610786, "learning_rate": 2.9882740930743862e-05, "loss": 0.4337, "step": 1402 }, { "epoch": 1.387734915924827, "grad_norm": 0.36123605275941045, "learning_rate": 2.9864419201172595e-05, "loss": 0.4312, "step": 1403 }, { "epoch": 1.3887240356083086, "grad_norm": 0.3363751995869784, "learning_rate": 2.9846097471601318e-05, "loss": 0.4351, "step": 1404 }, { "epoch": 1.3897131552917903, "grad_norm": 0.34547691242406064, "learning_rate": 2.982777574203005e-05, "loss": 0.4442, "step": 1405 }, { "epoch": 1.390702274975272, "grad_norm": 0.364044074816971, "learning_rate": 2.9809454012458778e-05, "loss": 0.4708, "step": 1406 }, { "epoch": 1.3916913946587537, "grad_norm": 0.3049021711534288, "learning_rate": 2.9791132282887508e-05, "loss": 0.477, "step": 1407 }, { "epoch": 1.3926805143422354, "grad_norm": 0.38211284345526886, "learning_rate": 2.9772810553316234e-05, "loss": 0.43, "step": 1408 }, { "epoch": 1.393669634025717, "grad_norm": 0.41386742466147336, "learning_rate": 2.9754488823744964e-05, "loss": 0.4561, "step": 1409 }, { "epoch": 1.3946587537091988, "grad_norm": 0.3386310090552587, "learning_rate": 2.973616709417369e-05, "loss": 0.4387, "step": 1410 }, { "epoch": 1.3956478733926805, "grad_norm": 0.3687161941753462, "learning_rate": 2.971784536460242e-05, "loss": 0.4446, "step": 1411 }, { "epoch": 1.3966369930761622, "grad_norm": 0.3763623220861594, "learning_rate": 2.9699523635031147e-05, "loss": 0.4598, "step": 1412 }, { "epoch": 1.3976261127596439, "grad_norm": 0.3938022952660516, "learning_rate": 2.9681201905459876e-05, "loss": 0.3538, "step": 1413 }, { "epoch": 1.3986152324431256, "grad_norm": 0.36478061032823716, "learning_rate": 2.9662880175888606e-05, "loss": 0.4526, "step": 1414 }, { "epoch": 1.3996043521266073, "grad_norm": 0.35983777332657113, "learning_rate": 2.9644558446317333e-05, "loss": 0.3961, "step": 1415 }, { "epoch": 1.400593471810089, "grad_norm": 0.4361337967780688, "learning_rate": 2.9626236716746066e-05, "loss": 0.46, "step": 1416 }, { "epoch": 1.4015825914935707, "grad_norm": 0.31864426898365455, "learning_rate": 2.960791498717479e-05, "loss": 0.4578, "step": 1417 }, { "epoch": 1.4025717111770524, "grad_norm": 4.370523949146226, "learning_rate": 2.9589593257603522e-05, "loss": 0.5349, "step": 1418 }, { "epoch": 1.403560830860534, "grad_norm": 0.7799236801274586, "learning_rate": 2.9571271528032245e-05, "loss": 0.4095, "step": 1419 }, { "epoch": 1.4045499505440158, "grad_norm": 0.36466940872257664, "learning_rate": 2.9552949798460978e-05, "loss": 0.3791, "step": 1420 }, { "epoch": 1.4055390702274975, "grad_norm": 0.5175092771422681, "learning_rate": 2.95346280688897e-05, "loss": 0.399, "step": 1421 }, { "epoch": 1.4065281899109792, "grad_norm": 1.7636557470614664, "learning_rate": 2.9516306339318434e-05, "loss": 0.4801, "step": 1422 }, { "epoch": 1.4075173095944609, "grad_norm": 0.4215106804422865, "learning_rate": 2.9497984609747157e-05, "loss": 0.4709, "step": 1423 }, { "epoch": 1.4085064292779426, "grad_norm": 0.4032698691829994, "learning_rate": 2.947966288017589e-05, "loss": 0.4624, "step": 1424 }, { "epoch": 1.4094955489614243, "grad_norm": 0.4050242307208399, "learning_rate": 2.946134115060462e-05, "loss": 0.5015, "step": 1425 }, { "epoch": 1.410484668644906, "grad_norm": 0.31172007206763763, "learning_rate": 2.9443019421033347e-05, "loss": 0.442, "step": 1426 }, { "epoch": 1.4114737883283877, "grad_norm": 0.36677022548571625, "learning_rate": 2.9424697691462077e-05, "loss": 0.4427, "step": 1427 }, { "epoch": 1.4124629080118694, "grad_norm": 0.3546662288262847, "learning_rate": 2.9406375961890803e-05, "loss": 0.4413, "step": 1428 }, { "epoch": 1.413452027695351, "grad_norm": 0.36917339334462623, "learning_rate": 2.9388054232319533e-05, "loss": 0.4743, "step": 1429 }, { "epoch": 1.4144411473788328, "grad_norm": 0.3675002506888929, "learning_rate": 2.936973250274826e-05, "loss": 0.46, "step": 1430 }, { "epoch": 1.4154302670623147, "grad_norm": 0.28465523267849413, "learning_rate": 2.935141077317699e-05, "loss": 0.418, "step": 1431 }, { "epoch": 1.4164193867457961, "grad_norm": 0.4170643740008079, "learning_rate": 2.9333089043605716e-05, "loss": 0.4097, "step": 1432 }, { "epoch": 1.417408506429278, "grad_norm": 0.3046886523834855, "learning_rate": 2.9314767314034445e-05, "loss": 0.4429, "step": 1433 }, { "epoch": 1.4183976261127595, "grad_norm": 0.40000915818868443, "learning_rate": 2.9296445584463172e-05, "loss": 0.4261, "step": 1434 }, { "epoch": 1.4193867457962415, "grad_norm": 0.38918672454921527, "learning_rate": 2.9278123854891905e-05, "loss": 0.4585, "step": 1435 }, { "epoch": 1.420375865479723, "grad_norm": 0.29249532179463994, "learning_rate": 2.9259802125320628e-05, "loss": 0.3921, "step": 1436 }, { "epoch": 1.4213649851632049, "grad_norm": 0.3243674617976441, "learning_rate": 2.924148039574936e-05, "loss": 0.511, "step": 1437 }, { "epoch": 1.4223541048466863, "grad_norm": 0.31778105502346576, "learning_rate": 2.922315866617809e-05, "loss": 0.4576, "step": 1438 }, { "epoch": 1.4233432245301683, "grad_norm": 0.3062041751866015, "learning_rate": 2.9204836936606817e-05, "loss": 0.4189, "step": 1439 }, { "epoch": 1.4243323442136497, "grad_norm": 0.309264172194014, "learning_rate": 2.9186515207035547e-05, "loss": 0.4606, "step": 1440 }, { "epoch": 1.4253214638971317, "grad_norm": 0.2801605149580132, "learning_rate": 2.9168193477464274e-05, "loss": 0.3993, "step": 1441 }, { "epoch": 1.4263105835806131, "grad_norm": 0.30985960538013585, "learning_rate": 2.9149871747893003e-05, "loss": 0.4407, "step": 1442 }, { "epoch": 1.427299703264095, "grad_norm": 0.3388775836866452, "learning_rate": 2.913155001832173e-05, "loss": 0.4429, "step": 1443 }, { "epoch": 1.4282888229475765, "grad_norm": 0.2889630554202242, "learning_rate": 2.911322828875046e-05, "loss": 0.432, "step": 1444 }, { "epoch": 1.4292779426310585, "grad_norm": 0.315605316728071, "learning_rate": 2.9094906559179186e-05, "loss": 0.4516, "step": 1445 }, { "epoch": 1.43026706231454, "grad_norm": 0.3457337439226024, "learning_rate": 2.9076584829607916e-05, "loss": 0.4674, "step": 1446 }, { "epoch": 1.4312561819980218, "grad_norm": 0.2952032263895049, "learning_rate": 2.9058263100036642e-05, "loss": 0.4521, "step": 1447 }, { "epoch": 1.4322453016815033, "grad_norm": 0.28620576457716407, "learning_rate": 2.9039941370465372e-05, "loss": 0.4621, "step": 1448 }, { "epoch": 1.4332344213649852, "grad_norm": 0.36744980164748403, "learning_rate": 2.9021619640894105e-05, "loss": 0.4888, "step": 1449 }, { "epoch": 1.434223541048467, "grad_norm": 0.339812788180591, "learning_rate": 2.900329791132283e-05, "loss": 0.5289, "step": 1450 }, { "epoch": 1.4352126607319486, "grad_norm": 0.31566827118837226, "learning_rate": 2.898497618175156e-05, "loss": 0.4492, "step": 1451 }, { "epoch": 1.4362017804154303, "grad_norm": 0.31330393196868006, "learning_rate": 2.8966654452180288e-05, "loss": 0.3981, "step": 1452 }, { "epoch": 1.437190900098912, "grad_norm": 0.3067900583902443, "learning_rate": 2.8948332722609018e-05, "loss": 0.4488, "step": 1453 }, { "epoch": 1.4381800197823937, "grad_norm": 0.2968129574510213, "learning_rate": 2.8930010993037744e-05, "loss": 0.447, "step": 1454 }, { "epoch": 1.4391691394658754, "grad_norm": 0.2864682872837869, "learning_rate": 2.8911689263466474e-05, "loss": 0.4034, "step": 1455 }, { "epoch": 1.4401582591493571, "grad_norm": 0.6880039761239137, "learning_rate": 2.88933675338952e-05, "loss": 0.4305, "step": 1456 }, { "epoch": 1.4411473788328388, "grad_norm": 0.27383935188184777, "learning_rate": 2.887504580432393e-05, "loss": 0.4472, "step": 1457 }, { "epoch": 1.4421364985163205, "grad_norm": 0.29593151735190626, "learning_rate": 2.8856724074752657e-05, "loss": 0.4261, "step": 1458 }, { "epoch": 1.4431256181998022, "grad_norm": 0.28273928682471794, "learning_rate": 2.8838402345181386e-05, "loss": 0.4034, "step": 1459 }, { "epoch": 1.444114737883284, "grad_norm": 0.3165369665953018, "learning_rate": 2.8820080615610116e-05, "loss": 0.4266, "step": 1460 }, { "epoch": 1.4451038575667656, "grad_norm": 0.3012146535540838, "learning_rate": 2.8801758886038843e-05, "loss": 0.4315, "step": 1461 }, { "epoch": 1.4460929772502473, "grad_norm": 0.2711981424629738, "learning_rate": 2.8783437156467576e-05, "loss": 0.4184, "step": 1462 }, { "epoch": 1.447082096933729, "grad_norm": 0.31002008293801026, "learning_rate": 2.87651154268963e-05, "loss": 0.4222, "step": 1463 }, { "epoch": 1.4480712166172107, "grad_norm": 0.3539283902685812, "learning_rate": 2.8746793697325032e-05, "loss": 0.4322, "step": 1464 }, { "epoch": 1.4490603363006924, "grad_norm": 0.3197987270904071, "learning_rate": 2.8728471967753755e-05, "loss": 0.4626, "step": 1465 }, { "epoch": 1.4500494559841741, "grad_norm": 0.294396776618464, "learning_rate": 2.8710150238182488e-05, "loss": 0.4294, "step": 1466 }, { "epoch": 1.4510385756676558, "grad_norm": 0.31250638103474127, "learning_rate": 2.869182850861121e-05, "loss": 0.491, "step": 1467 }, { "epoch": 1.4520276953511375, "grad_norm": 0.29465729172509825, "learning_rate": 2.8673506779039945e-05, "loss": 0.4182, "step": 1468 }, { "epoch": 1.4530168150346192, "grad_norm": 0.27683736051729835, "learning_rate": 2.8655185049468668e-05, "loss": 0.4176, "step": 1469 }, { "epoch": 1.454005934718101, "grad_norm": 0.3763143498743748, "learning_rate": 2.86368633198974e-05, "loss": 0.4864, "step": 1470 }, { "epoch": 1.4549950544015826, "grad_norm": 0.2951254080913707, "learning_rate": 2.8618541590326127e-05, "loss": 0.4084, "step": 1471 }, { "epoch": 1.4559841740850643, "grad_norm": 0.2874094634392024, "learning_rate": 2.8600219860754857e-05, "loss": 0.4525, "step": 1472 }, { "epoch": 1.456973293768546, "grad_norm": 0.34781596007553534, "learning_rate": 2.8581898131183587e-05, "loss": 0.4446, "step": 1473 }, { "epoch": 1.4579624134520277, "grad_norm": 0.3118284461430052, "learning_rate": 2.8563576401612313e-05, "loss": 0.4084, "step": 1474 }, { "epoch": 1.4589515331355094, "grad_norm": 0.3162361256418991, "learning_rate": 2.8545254672041043e-05, "loss": 0.4418, "step": 1475 }, { "epoch": 1.459940652818991, "grad_norm": 0.38053633547795723, "learning_rate": 2.852693294246977e-05, "loss": 0.3583, "step": 1476 }, { "epoch": 1.4609297725024728, "grad_norm": 0.30505081841976306, "learning_rate": 2.85086112128985e-05, "loss": 0.4295, "step": 1477 }, { "epoch": 1.4619188921859545, "grad_norm": 0.3700760611954873, "learning_rate": 2.8490289483327226e-05, "loss": 0.4091, "step": 1478 }, { "epoch": 1.4629080118694362, "grad_norm": 0.29239279326719575, "learning_rate": 2.8471967753755955e-05, "loss": 0.4535, "step": 1479 }, { "epoch": 1.463897131552918, "grad_norm": 0.3096224600613784, "learning_rate": 2.8453646024184682e-05, "loss": 0.4517, "step": 1480 }, { "epoch": 1.4648862512363996, "grad_norm": 0.29982859930862565, "learning_rate": 2.8435324294613415e-05, "loss": 0.3935, "step": 1481 }, { "epoch": 1.4658753709198813, "grad_norm": 0.2972910923176461, "learning_rate": 2.8417002565042138e-05, "loss": 0.4544, "step": 1482 }, { "epoch": 1.466864490603363, "grad_norm": 0.2926067116253536, "learning_rate": 2.839868083547087e-05, "loss": 0.415, "step": 1483 }, { "epoch": 1.4678536102868447, "grad_norm": 0.3328585626789144, "learning_rate": 2.83803591058996e-05, "loss": 0.4579, "step": 1484 }, { "epoch": 1.4688427299703264, "grad_norm": 0.3481307578459911, "learning_rate": 2.8362037376328327e-05, "loss": 0.4605, "step": 1485 }, { "epoch": 1.469831849653808, "grad_norm": 0.31652707929635426, "learning_rate": 2.8343715646757057e-05, "loss": 0.4656, "step": 1486 }, { "epoch": 1.4708209693372898, "grad_norm": 0.29953398282105115, "learning_rate": 2.8325393917185784e-05, "loss": 0.3825, "step": 1487 }, { "epoch": 1.4718100890207715, "grad_norm": 0.3097931361873477, "learning_rate": 2.8307072187614514e-05, "loss": 0.3972, "step": 1488 }, { "epoch": 1.4727992087042532, "grad_norm": 0.33071430642317845, "learning_rate": 2.828875045804324e-05, "loss": 0.4285, "step": 1489 }, { "epoch": 1.4737883283877349, "grad_norm": 0.2859188800768853, "learning_rate": 2.827042872847197e-05, "loss": 0.418, "step": 1490 }, { "epoch": 1.4747774480712166, "grad_norm": 0.2881058247271708, "learning_rate": 2.8252106998900696e-05, "loss": 0.4643, "step": 1491 }, { "epoch": 1.4757665677546983, "grad_norm": 0.38974799158353357, "learning_rate": 2.8233785269329426e-05, "loss": 0.4169, "step": 1492 }, { "epoch": 1.47675568743818, "grad_norm": 0.3275131922732535, "learning_rate": 2.8215463539758152e-05, "loss": 0.446, "step": 1493 }, { "epoch": 1.4777448071216617, "grad_norm": 0.3299066979177711, "learning_rate": 2.8197141810186882e-05, "loss": 0.4617, "step": 1494 }, { "epoch": 1.4787339268051434, "grad_norm": 0.364073130150921, "learning_rate": 2.817882008061561e-05, "loss": 0.5335, "step": 1495 }, { "epoch": 1.479723046488625, "grad_norm": 0.3277284148931501, "learning_rate": 2.816049835104434e-05, "loss": 0.4456, "step": 1496 }, { "epoch": 1.4807121661721068, "grad_norm": 0.43802618542564276, "learning_rate": 2.814217662147307e-05, "loss": 0.4724, "step": 1497 }, { "epoch": 1.4817012858555885, "grad_norm": 0.2765090527927701, "learning_rate": 2.8123854891901798e-05, "loss": 0.3961, "step": 1498 }, { "epoch": 1.4826904055390702, "grad_norm": 0.3416485689233916, "learning_rate": 2.8105533162330528e-05, "loss": 0.4166, "step": 1499 }, { "epoch": 1.4836795252225519, "grad_norm": 0.37839623878952566, "learning_rate": 2.8087211432759254e-05, "loss": 0.4906, "step": 1500 }, { "epoch": 1.4846686449060336, "grad_norm": 0.3518638957400736, "learning_rate": 2.8068889703187984e-05, "loss": 0.4069, "step": 1501 }, { "epoch": 1.4856577645895153, "grad_norm": 0.33111446442715076, "learning_rate": 2.805056797361671e-05, "loss": 0.4293, "step": 1502 }, { "epoch": 1.486646884272997, "grad_norm": 0.295453610129648, "learning_rate": 2.803224624404544e-05, "loss": 0.4292, "step": 1503 }, { "epoch": 1.4876360039564787, "grad_norm": 0.2756874683521067, "learning_rate": 2.8013924514474167e-05, "loss": 0.4104, "step": 1504 }, { "epoch": 1.4886251236399604, "grad_norm": 0.3798280905192116, "learning_rate": 2.7995602784902896e-05, "loss": 0.4483, "step": 1505 }, { "epoch": 1.489614243323442, "grad_norm": 0.3414752636312388, "learning_rate": 2.7977281055331623e-05, "loss": 0.4637, "step": 1506 }, { "epoch": 1.4906033630069238, "grad_norm": 0.35252371421106593, "learning_rate": 2.7958959325760353e-05, "loss": 0.4697, "step": 1507 }, { "epoch": 1.4915924826904057, "grad_norm": 0.35553900870734595, "learning_rate": 2.7940637596189086e-05, "loss": 0.4427, "step": 1508 }, { "epoch": 1.4925816023738872, "grad_norm": 0.27479809910211267, "learning_rate": 2.792231586661781e-05, "loss": 0.4329, "step": 1509 }, { "epoch": 1.493570722057369, "grad_norm": 0.3169137202038055, "learning_rate": 2.7903994137046542e-05, "loss": 0.4504, "step": 1510 }, { "epoch": 1.4945598417408505, "grad_norm": 0.3611773054393912, "learning_rate": 2.7885672407475265e-05, "loss": 0.3944, "step": 1511 }, { "epoch": 1.4955489614243325, "grad_norm": 0.27848717086943314, "learning_rate": 2.7867350677904e-05, "loss": 0.4297, "step": 1512 }, { "epoch": 1.496538081107814, "grad_norm": 0.2890433618635751, "learning_rate": 2.784902894833272e-05, "loss": 0.4256, "step": 1513 }, { "epoch": 1.4975272007912959, "grad_norm": 0.29233252685415784, "learning_rate": 2.7830707218761455e-05, "loss": 0.5019, "step": 1514 }, { "epoch": 1.4985163204747773, "grad_norm": 0.2808355518258454, "learning_rate": 2.7812385489190178e-05, "loss": 0.4533, "step": 1515 }, { "epoch": 1.4995054401582593, "grad_norm": 0.26776681113551326, "learning_rate": 2.779406375961891e-05, "loss": 0.4154, "step": 1516 }, { "epoch": 1.5004945598417407, "grad_norm": 0.318650708286605, "learning_rate": 2.7775742030047637e-05, "loss": 0.4203, "step": 1517 }, { "epoch": 1.5014836795252227, "grad_norm": 0.26460015417559507, "learning_rate": 2.7757420300476367e-05, "loss": 0.4147, "step": 1518 }, { "epoch": 1.5024727992087041, "grad_norm": 0.25425666410240005, "learning_rate": 2.7739098570905093e-05, "loss": 0.3944, "step": 1519 }, { "epoch": 1.503461918892186, "grad_norm": 0.35103080270975756, "learning_rate": 2.7720776841333823e-05, "loss": 0.417, "step": 1520 }, { "epoch": 1.5044510385756675, "grad_norm": 0.31288047385282897, "learning_rate": 2.7702455111762553e-05, "loss": 0.4376, "step": 1521 }, { "epoch": 1.5054401582591495, "grad_norm": 0.3026304301870927, "learning_rate": 2.768413338219128e-05, "loss": 0.4141, "step": 1522 }, { "epoch": 1.506429277942631, "grad_norm": 0.29749849928337113, "learning_rate": 2.766581165262001e-05, "loss": 0.4098, "step": 1523 }, { "epoch": 1.5074183976261128, "grad_norm": 0.29038320359051056, "learning_rate": 2.7647489923048736e-05, "loss": 0.4431, "step": 1524 }, { "epoch": 1.5084075173095943, "grad_norm": 0.2907976228534865, "learning_rate": 2.7629168193477466e-05, "loss": 0.4574, "step": 1525 }, { "epoch": 1.5093966369930762, "grad_norm": 0.25183312695543336, "learning_rate": 2.7610846463906192e-05, "loss": 0.3744, "step": 1526 }, { "epoch": 1.5103857566765577, "grad_norm": 0.2855912969143611, "learning_rate": 2.7592524734334925e-05, "loss": 0.4661, "step": 1527 }, { "epoch": 1.5113748763600396, "grad_norm": 0.2732293198137018, "learning_rate": 2.7574203004763648e-05, "loss": 0.4355, "step": 1528 }, { "epoch": 1.5123639960435211, "grad_norm": 0.3213776545316619, "learning_rate": 2.755588127519238e-05, "loss": 0.455, "step": 1529 }, { "epoch": 1.513353115727003, "grad_norm": 0.29145234783713037, "learning_rate": 2.7537559545621104e-05, "loss": 0.4878, "step": 1530 }, { "epoch": 1.5143422354104845, "grad_norm": 0.266071594136441, "learning_rate": 2.7519237816049838e-05, "loss": 0.4167, "step": 1531 }, { "epoch": 1.5153313550939664, "grad_norm": 0.2731003398932023, "learning_rate": 2.7500916086478567e-05, "loss": 0.4768, "step": 1532 }, { "epoch": 1.516320474777448, "grad_norm": 0.3097341831479785, "learning_rate": 2.7482594356907294e-05, "loss": 0.4474, "step": 1533 }, { "epoch": 1.5173095944609298, "grad_norm": 0.34027538016329345, "learning_rate": 2.7464272627336024e-05, "loss": 0.4393, "step": 1534 }, { "epoch": 1.5182987141444113, "grad_norm": 1.2356940989603606, "learning_rate": 2.744595089776475e-05, "loss": 0.4578, "step": 1535 }, { "epoch": 1.5192878338278932, "grad_norm": 0.5459791028027153, "learning_rate": 2.742762916819348e-05, "loss": 0.4724, "step": 1536 }, { "epoch": 1.520276953511375, "grad_norm": 0.31844571348235073, "learning_rate": 2.7409307438622206e-05, "loss": 0.4125, "step": 1537 }, { "epoch": 1.5212660731948566, "grad_norm": 0.31015078316170824, "learning_rate": 2.7390985709050936e-05, "loss": 0.433, "step": 1538 }, { "epoch": 1.5222551928783383, "grad_norm": 0.29396592937434984, "learning_rate": 2.7372663979479662e-05, "loss": 0.4376, "step": 1539 }, { "epoch": 1.52324431256182, "grad_norm": 0.2911564550284436, "learning_rate": 2.7354342249908392e-05, "loss": 0.4112, "step": 1540 }, { "epoch": 1.5242334322453017, "grad_norm": 0.3141108762736842, "learning_rate": 2.733602052033712e-05, "loss": 0.4333, "step": 1541 }, { "epoch": 1.5252225519287834, "grad_norm": 0.28602791405930644, "learning_rate": 2.731769879076585e-05, "loss": 0.4535, "step": 1542 }, { "epoch": 1.5262116716122651, "grad_norm": 0.2559604367618264, "learning_rate": 2.7299377061194575e-05, "loss": 0.4126, "step": 1543 }, { "epoch": 1.5272007912957468, "grad_norm": 0.27275667409056814, "learning_rate": 2.7281055331623305e-05, "loss": 0.3398, "step": 1544 }, { "epoch": 1.5281899109792285, "grad_norm": 0.27888080177130276, "learning_rate": 2.7262733602052038e-05, "loss": 0.396, "step": 1545 }, { "epoch": 1.5291790306627102, "grad_norm": 0.29457676257202875, "learning_rate": 2.7244411872480764e-05, "loss": 0.4142, "step": 1546 }, { "epoch": 1.530168150346192, "grad_norm": 0.2657669136258467, "learning_rate": 2.7226090142909494e-05, "loss": 0.3811, "step": 1547 }, { "epoch": 1.5311572700296736, "grad_norm": 0.2787061006660649, "learning_rate": 2.720776841333822e-05, "loss": 0.3936, "step": 1548 }, { "epoch": 1.5321463897131553, "grad_norm": 0.37452021826356185, "learning_rate": 2.718944668376695e-05, "loss": 0.4526, "step": 1549 }, { "epoch": 1.533135509396637, "grad_norm": 0.24566974329827732, "learning_rate": 2.7171124954195677e-05, "loss": 0.3751, "step": 1550 }, { "epoch": 1.5341246290801187, "grad_norm": 0.2757225886209204, "learning_rate": 2.7152803224624407e-05, "loss": 0.466, "step": 1551 }, { "epoch": 1.5351137487636004, "grad_norm": 0.360202782237012, "learning_rate": 2.7134481495053133e-05, "loss": 0.5028, "step": 1552 }, { "epoch": 1.536102868447082, "grad_norm": 0.28217047010151747, "learning_rate": 2.7116159765481863e-05, "loss": 0.4216, "step": 1553 }, { "epoch": 1.5370919881305638, "grad_norm": 0.3369657062476821, "learning_rate": 2.709783803591059e-05, "loss": 0.4474, "step": 1554 }, { "epoch": 1.5380811078140455, "grad_norm": 0.32643487155724593, "learning_rate": 2.707951630633932e-05, "loss": 0.4275, "step": 1555 }, { "epoch": 1.5390702274975272, "grad_norm": 0.2817779215324222, "learning_rate": 2.7061194576768052e-05, "loss": 0.4371, "step": 1556 }, { "epoch": 1.540059347181009, "grad_norm": 0.3063323029729112, "learning_rate": 2.7042872847196775e-05, "loss": 0.4084, "step": 1557 }, { "epoch": 1.5410484668644906, "grad_norm": 0.26570368053936894, "learning_rate": 2.702455111762551e-05, "loss": 0.4377, "step": 1558 }, { "epoch": 1.5420375865479723, "grad_norm": 0.28478093201712384, "learning_rate": 2.700622938805423e-05, "loss": 0.3711, "step": 1559 }, { "epoch": 1.543026706231454, "grad_norm": 0.3527584738458635, "learning_rate": 2.6987907658482965e-05, "loss": 0.4467, "step": 1560 }, { "epoch": 1.5440158259149357, "grad_norm": 0.2998161804452798, "learning_rate": 2.6969585928911688e-05, "loss": 0.4022, "step": 1561 }, { "epoch": 1.5450049455984174, "grad_norm": 0.30287586677925865, "learning_rate": 2.695126419934042e-05, "loss": 0.4276, "step": 1562 }, { "epoch": 1.545994065281899, "grad_norm": 0.29066288076793795, "learning_rate": 2.6932942469769147e-05, "loss": 0.4298, "step": 1563 }, { "epoch": 1.5469831849653808, "grad_norm": 0.36634590156409147, "learning_rate": 2.6914620740197877e-05, "loss": 0.4333, "step": 1564 }, { "epoch": 1.5479723046488625, "grad_norm": 0.3516684553628714, "learning_rate": 2.6896299010626604e-05, "loss": 0.474, "step": 1565 }, { "epoch": 1.5489614243323442, "grad_norm": 0.3315473412552291, "learning_rate": 2.6877977281055333e-05, "loss": 0.4199, "step": 1566 }, { "epoch": 1.5499505440158259, "grad_norm": 0.304443458898562, "learning_rate": 2.685965555148406e-05, "loss": 0.4685, "step": 1567 }, { "epoch": 1.5509396636993076, "grad_norm": 0.34939766241844106, "learning_rate": 2.684133382191279e-05, "loss": 0.3867, "step": 1568 }, { "epoch": 1.5519287833827893, "grad_norm": 0.3364698970448001, "learning_rate": 2.682301209234152e-05, "loss": 0.4314, "step": 1569 }, { "epoch": 1.552917903066271, "grad_norm": 0.34050583019646397, "learning_rate": 2.6804690362770246e-05, "loss": 0.4741, "step": 1570 }, { "epoch": 1.5539070227497527, "grad_norm": 0.26237037768227367, "learning_rate": 2.6786368633198976e-05, "loss": 0.4107, "step": 1571 }, { "epoch": 1.5548961424332344, "grad_norm": 0.3461301127225192, "learning_rate": 2.6768046903627702e-05, "loss": 0.4541, "step": 1572 }, { "epoch": 1.555885262116716, "grad_norm": 0.2511422428233448, "learning_rate": 2.6749725174056435e-05, "loss": 0.4286, "step": 1573 }, { "epoch": 1.5568743818001978, "grad_norm": 0.6854382606786477, "learning_rate": 2.6731403444485158e-05, "loss": 0.5168, "step": 1574 }, { "epoch": 1.5578635014836797, "grad_norm": 0.29587534482934763, "learning_rate": 2.671308171491389e-05, "loss": 0.4379, "step": 1575 }, { "epoch": 1.5588526211671612, "grad_norm": 0.30965230188237924, "learning_rate": 2.6694759985342614e-05, "loss": 0.4372, "step": 1576 }, { "epoch": 1.559841740850643, "grad_norm": 0.29630073934251444, "learning_rate": 2.6676438255771348e-05, "loss": 0.4082, "step": 1577 }, { "epoch": 1.5608308605341246, "grad_norm": 0.3254718398772897, "learning_rate": 2.665811652620007e-05, "loss": 0.446, "step": 1578 }, { "epoch": 1.5618199802176065, "grad_norm": 0.35742974851957743, "learning_rate": 2.6639794796628804e-05, "loss": 0.4064, "step": 1579 }, { "epoch": 1.562809099901088, "grad_norm": 0.297566684364922, "learning_rate": 2.6621473067057534e-05, "loss": 0.3941, "step": 1580 }, { "epoch": 1.5637982195845699, "grad_norm": 0.34824595226140137, "learning_rate": 2.660315133748626e-05, "loss": 0.4817, "step": 1581 }, { "epoch": 1.5647873392680514, "grad_norm": 0.37702204793009536, "learning_rate": 2.658482960791499e-05, "loss": 0.4767, "step": 1582 }, { "epoch": 1.5657764589515333, "grad_norm": 0.35461074860740005, "learning_rate": 2.6566507878343716e-05, "loss": 0.4296, "step": 1583 }, { "epoch": 1.5667655786350148, "grad_norm": 0.3200818420727633, "learning_rate": 2.6548186148772446e-05, "loss": 0.381, "step": 1584 }, { "epoch": 1.5677546983184967, "grad_norm": 0.425899730481969, "learning_rate": 2.6529864419201173e-05, "loss": 0.4158, "step": 1585 }, { "epoch": 1.5687438180019782, "grad_norm": 1.1299623445949691, "learning_rate": 2.6511542689629902e-05, "loss": 0.3845, "step": 1586 }, { "epoch": 1.56973293768546, "grad_norm": 0.2782420434679356, "learning_rate": 2.649322096005863e-05, "loss": 0.4361, "step": 1587 }, { "epoch": 1.5707220573689415, "grad_norm": 0.3363493977754832, "learning_rate": 2.647489923048736e-05, "loss": 0.3991, "step": 1588 }, { "epoch": 1.5717111770524235, "grad_norm": 0.3204275823714799, "learning_rate": 2.6456577500916085e-05, "loss": 0.5167, "step": 1589 }, { "epoch": 1.572700296735905, "grad_norm": 0.3366195485123245, "learning_rate": 2.6438255771344815e-05, "loss": 0.4971, "step": 1590 }, { "epoch": 1.5736894164193869, "grad_norm": 0.3067388676397528, "learning_rate": 2.641993404177354e-05, "loss": 0.4899, "step": 1591 }, { "epoch": 1.5746785361028683, "grad_norm": 0.27590309427573345, "learning_rate": 2.6401612312202274e-05, "loss": 0.4055, "step": 1592 }, { "epoch": 1.5756676557863503, "grad_norm": 0.3184897304138686, "learning_rate": 2.6383290582631004e-05, "loss": 0.4245, "step": 1593 }, { "epoch": 1.5766567754698317, "grad_norm": 0.2753393949803328, "learning_rate": 2.636496885305973e-05, "loss": 0.4415, "step": 1594 }, { "epoch": 1.5776458951533137, "grad_norm": 0.2696948403053459, "learning_rate": 2.634664712348846e-05, "loss": 0.4367, "step": 1595 }, { "epoch": 1.5786350148367951, "grad_norm": 0.291084848882316, "learning_rate": 2.6328325393917187e-05, "loss": 0.4221, "step": 1596 }, { "epoch": 1.579624134520277, "grad_norm": 0.29012800052048404, "learning_rate": 2.6310003664345917e-05, "loss": 0.407, "step": 1597 }, { "epoch": 1.5806132542037585, "grad_norm": 0.32837973620046834, "learning_rate": 2.6291681934774643e-05, "loss": 0.4483, "step": 1598 }, { "epoch": 1.5816023738872405, "grad_norm": 0.28011551139081314, "learning_rate": 2.6273360205203373e-05, "loss": 0.4711, "step": 1599 }, { "epoch": 1.582591493570722, "grad_norm": 0.3282890674772606, "learning_rate": 2.62550384756321e-05, "loss": 0.3873, "step": 1600 }, { "epoch": 1.5835806132542039, "grad_norm": 0.33875347479904816, "learning_rate": 2.623671674606083e-05, "loss": 0.4552, "step": 1601 }, { "epoch": 1.5845697329376853, "grad_norm": 0.29670107000989415, "learning_rate": 2.6218395016489556e-05, "loss": 0.4795, "step": 1602 }, { "epoch": 1.5855588526211672, "grad_norm": 0.265294490890646, "learning_rate": 2.6200073286918285e-05, "loss": 0.3973, "step": 1603 }, { "epoch": 1.5865479723046487, "grad_norm": 0.33359772528312437, "learning_rate": 2.618175155734702e-05, "loss": 0.4843, "step": 1604 }, { "epoch": 1.5875370919881306, "grad_norm": 0.4026752678559618, "learning_rate": 2.616342982777574e-05, "loss": 0.4425, "step": 1605 }, { "epoch": 1.5885262116716121, "grad_norm": 0.3058367116238238, "learning_rate": 2.6145108098204475e-05, "loss": 0.4316, "step": 1606 }, { "epoch": 1.589515331355094, "grad_norm": 0.30426608314843634, "learning_rate": 2.6126786368633198e-05, "loss": 0.4927, "step": 1607 }, { "epoch": 1.5905044510385755, "grad_norm": 0.37917329704685165, "learning_rate": 2.610846463906193e-05, "loss": 0.5052, "step": 1608 }, { "epoch": 1.5914935707220574, "grad_norm": 0.33134630981635044, "learning_rate": 2.6090142909490657e-05, "loss": 0.4995, "step": 1609 }, { "epoch": 1.592482690405539, "grad_norm": 0.2991963652688359, "learning_rate": 2.6071821179919387e-05, "loss": 0.3927, "step": 1610 }, { "epoch": 1.5934718100890208, "grad_norm": 0.3570948011063814, "learning_rate": 2.6053499450348114e-05, "loss": 0.4624, "step": 1611 }, { "epoch": 1.5944609297725023, "grad_norm": 0.3199568594829708, "learning_rate": 2.6035177720776843e-05, "loss": 0.4605, "step": 1612 }, { "epoch": 1.5954500494559842, "grad_norm": 0.33280196155936287, "learning_rate": 2.601685599120557e-05, "loss": 0.4746, "step": 1613 }, { "epoch": 1.596439169139466, "grad_norm": 0.39417035896603214, "learning_rate": 2.59985342616343e-05, "loss": 0.4208, "step": 1614 }, { "epoch": 1.5974282888229476, "grad_norm": 0.33066363581898495, "learning_rate": 2.598021253206303e-05, "loss": 0.504, "step": 1615 }, { "epoch": 1.5984174085064293, "grad_norm": 0.3461460215044843, "learning_rate": 2.5961890802491756e-05, "loss": 0.4331, "step": 1616 }, { "epoch": 1.599406528189911, "grad_norm": 0.3652275383465214, "learning_rate": 2.5943569072920486e-05, "loss": 0.4299, "step": 1617 }, { "epoch": 1.6003956478733927, "grad_norm": 0.30120482055467496, "learning_rate": 2.5925247343349212e-05, "loss": 0.4453, "step": 1618 }, { "epoch": 1.6013847675568744, "grad_norm": 0.30615378277352595, "learning_rate": 2.5906925613777945e-05, "loss": 0.4507, "step": 1619 }, { "epoch": 1.6023738872403561, "grad_norm": 0.32283104348568503, "learning_rate": 2.5888603884206668e-05, "loss": 0.3887, "step": 1620 }, { "epoch": 1.6033630069238378, "grad_norm": 0.38745300511964836, "learning_rate": 2.58702821546354e-05, "loss": 0.4475, "step": 1621 }, { "epoch": 1.6043521266073195, "grad_norm": 0.29046077388027175, "learning_rate": 2.5851960425064125e-05, "loss": 0.4147, "step": 1622 }, { "epoch": 1.6053412462908012, "grad_norm": 0.440454936747926, "learning_rate": 2.5833638695492858e-05, "loss": 0.4652, "step": 1623 }, { "epoch": 1.606330365974283, "grad_norm": 0.6507104848117019, "learning_rate": 2.581531696592158e-05, "loss": 0.4032, "step": 1624 }, { "epoch": 1.6073194856577646, "grad_norm": 0.3264866453501732, "learning_rate": 2.5796995236350314e-05, "loss": 0.4031, "step": 1625 }, { "epoch": 1.6083086053412463, "grad_norm": 0.3023992104795454, "learning_rate": 2.5778673506779037e-05, "loss": 0.4203, "step": 1626 }, { "epoch": 1.609297725024728, "grad_norm": 0.359657753419691, "learning_rate": 2.576035177720777e-05, "loss": 0.4624, "step": 1627 }, { "epoch": 1.6102868447082097, "grad_norm": 0.3372351206140866, "learning_rate": 2.57420300476365e-05, "loss": 0.4056, "step": 1628 }, { "epoch": 1.6112759643916914, "grad_norm": 0.30039862972108095, "learning_rate": 2.5723708318065226e-05, "loss": 0.4399, "step": 1629 }, { "epoch": 1.612265084075173, "grad_norm": 0.30065834303511957, "learning_rate": 2.5705386588493956e-05, "loss": 0.4663, "step": 1630 }, { "epoch": 1.6132542037586548, "grad_norm": 0.3410392017721668, "learning_rate": 2.5687064858922683e-05, "loss": 0.4458, "step": 1631 }, { "epoch": 1.6142433234421365, "grad_norm": 0.31764577949936024, "learning_rate": 2.5668743129351412e-05, "loss": 0.4946, "step": 1632 }, { "epoch": 1.6152324431256182, "grad_norm": 0.3139161488552942, "learning_rate": 2.565042139978014e-05, "loss": 0.45, "step": 1633 }, { "epoch": 1.6162215628091, "grad_norm": 0.2838563703036641, "learning_rate": 2.563209967020887e-05, "loss": 0.3615, "step": 1634 }, { "epoch": 1.6172106824925816, "grad_norm": 0.280910691075243, "learning_rate": 2.5613777940637595e-05, "loss": 0.3664, "step": 1635 }, { "epoch": 1.6181998021760633, "grad_norm": 0.3442161215592867, "learning_rate": 2.5595456211066325e-05, "loss": 0.45, "step": 1636 }, { "epoch": 1.619188921859545, "grad_norm": 0.2953185716419831, "learning_rate": 2.557713448149505e-05, "loss": 0.4172, "step": 1637 }, { "epoch": 1.6201780415430267, "grad_norm": 0.2654726745970755, "learning_rate": 2.5558812751923784e-05, "loss": 0.3943, "step": 1638 }, { "epoch": 1.6211671612265084, "grad_norm": 0.3200576472766354, "learning_rate": 2.5540491022352514e-05, "loss": 0.4265, "step": 1639 }, { "epoch": 1.62215628090999, "grad_norm": 0.29628265645606083, "learning_rate": 2.552216929278124e-05, "loss": 0.4116, "step": 1640 }, { "epoch": 1.6231454005934718, "grad_norm": 0.2841844484198089, "learning_rate": 2.550384756320997e-05, "loss": 0.44, "step": 1641 }, { "epoch": 1.6241345202769535, "grad_norm": 0.2557201167123379, "learning_rate": 2.5485525833638697e-05, "loss": 0.3717, "step": 1642 }, { "epoch": 1.6251236399604352, "grad_norm": 0.33250075592922157, "learning_rate": 2.5467204104067427e-05, "loss": 0.4857, "step": 1643 }, { "epoch": 1.6261127596439169, "grad_norm": 0.2800845720099231, "learning_rate": 2.5448882374496153e-05, "loss": 0.3958, "step": 1644 }, { "epoch": 1.6271018793273986, "grad_norm": 0.31456766982731915, "learning_rate": 2.5430560644924883e-05, "loss": 0.4571, "step": 1645 }, { "epoch": 1.6280909990108803, "grad_norm": 0.277187648256086, "learning_rate": 2.541223891535361e-05, "loss": 0.3837, "step": 1646 }, { "epoch": 1.629080118694362, "grad_norm": 0.26160671112794515, "learning_rate": 2.539391718578234e-05, "loss": 0.4197, "step": 1647 }, { "epoch": 1.6300692383778437, "grad_norm": 0.3224446440259182, "learning_rate": 2.5375595456211066e-05, "loss": 0.44, "step": 1648 }, { "epoch": 1.6310583580613254, "grad_norm": 0.2627299707116854, "learning_rate": 2.5357273726639795e-05, "loss": 0.4301, "step": 1649 }, { "epoch": 1.632047477744807, "grad_norm": 0.2959085159607189, "learning_rate": 2.5338951997068522e-05, "loss": 0.4882, "step": 1650 }, { "epoch": 1.6330365974282888, "grad_norm": 0.28662674449197795, "learning_rate": 2.532063026749725e-05, "loss": 0.4183, "step": 1651 }, { "epoch": 1.6340257171117705, "grad_norm": 0.30546067103773544, "learning_rate": 2.5302308537925985e-05, "loss": 0.413, "step": 1652 }, { "epoch": 1.6350148367952522, "grad_norm": 0.276370031240087, "learning_rate": 2.5283986808354708e-05, "loss": 0.4232, "step": 1653 }, { "epoch": 1.636003956478734, "grad_norm": 0.2706115264434037, "learning_rate": 2.526566507878344e-05, "loss": 0.4293, "step": 1654 }, { "epoch": 1.6369930761622156, "grad_norm": 0.30532651247215364, "learning_rate": 2.5247343349212167e-05, "loss": 0.4428, "step": 1655 }, { "epoch": 1.6379821958456975, "grad_norm": 0.3187299875860798, "learning_rate": 2.5229021619640897e-05, "loss": 0.4933, "step": 1656 }, { "epoch": 1.638971315529179, "grad_norm": 0.2719115674229351, "learning_rate": 2.5210699890069624e-05, "loss": 0.3899, "step": 1657 }, { "epoch": 1.6399604352126609, "grad_norm": 0.27197612915344427, "learning_rate": 2.5192378160498353e-05, "loss": 0.3803, "step": 1658 }, { "epoch": 1.6409495548961424, "grad_norm": 0.30000048728325507, "learning_rate": 2.517405643092708e-05, "loss": 0.4025, "step": 1659 }, { "epoch": 1.6419386745796243, "grad_norm": 0.2644120267282455, "learning_rate": 2.515573470135581e-05, "loss": 0.3988, "step": 1660 }, { "epoch": 1.6429277942631058, "grad_norm": 0.2741960980354914, "learning_rate": 2.5137412971784536e-05, "loss": 0.4587, "step": 1661 }, { "epoch": 1.6439169139465877, "grad_norm": 0.2930096159187744, "learning_rate": 2.5119091242213266e-05, "loss": 0.4697, "step": 1662 }, { "epoch": 1.6449060336300692, "grad_norm": 0.35317847881770087, "learning_rate": 2.5100769512641996e-05, "loss": 0.4459, "step": 1663 }, { "epoch": 1.645895153313551, "grad_norm": 0.2870305099438775, "learning_rate": 2.5082447783070722e-05, "loss": 0.4329, "step": 1664 }, { "epoch": 1.6468842729970326, "grad_norm": 0.31469807580481884, "learning_rate": 2.5064126053499455e-05, "loss": 0.4459, "step": 1665 }, { "epoch": 1.6478733926805145, "grad_norm": 0.32969026971793497, "learning_rate": 2.504580432392818e-05, "loss": 0.4279, "step": 1666 }, { "epoch": 1.648862512363996, "grad_norm": 0.331096333283008, "learning_rate": 2.502748259435691e-05, "loss": 0.4519, "step": 1667 }, { "epoch": 1.6498516320474779, "grad_norm": 0.3441582936309551, "learning_rate": 2.5009160864785635e-05, "loss": 0.4669, "step": 1668 }, { "epoch": 1.6508407517309593, "grad_norm": 0.3160027414594968, "learning_rate": 2.4990839135214368e-05, "loss": 0.4374, "step": 1669 }, { "epoch": 1.6518298714144413, "grad_norm": 0.2995831571732668, "learning_rate": 2.4972517405643094e-05, "loss": 0.4105, "step": 1670 }, { "epoch": 1.6528189910979227, "grad_norm": 0.4442058450146274, "learning_rate": 2.4954195676071824e-05, "loss": 0.4731, "step": 1671 }, { "epoch": 1.6538081107814047, "grad_norm": 0.33416266456418664, "learning_rate": 2.493587394650055e-05, "loss": 0.4784, "step": 1672 }, { "epoch": 1.6547972304648861, "grad_norm": 0.3146001816140936, "learning_rate": 2.491755221692928e-05, "loss": 0.4262, "step": 1673 }, { "epoch": 1.655786350148368, "grad_norm": 0.3743726426973562, "learning_rate": 2.4899230487358007e-05, "loss": 0.522, "step": 1674 }, { "epoch": 1.6567754698318495, "grad_norm": 0.37599411126000926, "learning_rate": 2.4880908757786736e-05, "loss": 0.4539, "step": 1675 }, { "epoch": 1.6577645895153315, "grad_norm": 0.2971196338181042, "learning_rate": 2.4862587028215463e-05, "loss": 0.4425, "step": 1676 }, { "epoch": 1.658753709198813, "grad_norm": 0.3185181610361161, "learning_rate": 2.4844265298644193e-05, "loss": 0.4499, "step": 1677 }, { "epoch": 1.6597428288822949, "grad_norm": 0.34756786268548084, "learning_rate": 2.4825943569072922e-05, "loss": 0.4892, "step": 1678 }, { "epoch": 1.6607319485657763, "grad_norm": 0.30828529971670104, "learning_rate": 2.480762183950165e-05, "loss": 0.408, "step": 1679 }, { "epoch": 1.6617210682492582, "grad_norm": 0.3146532160887173, "learning_rate": 2.478930010993038e-05, "loss": 0.4639, "step": 1680 }, { "epoch": 1.6627101879327397, "grad_norm": 0.30842421088723865, "learning_rate": 2.477097838035911e-05, "loss": 0.4362, "step": 1681 }, { "epoch": 1.6636993076162216, "grad_norm": 0.3318964750018238, "learning_rate": 2.4752656650787835e-05, "loss": 0.4272, "step": 1682 }, { "epoch": 1.6646884272997031, "grad_norm": 0.2719805738190358, "learning_rate": 2.4734334921216565e-05, "loss": 0.3888, "step": 1683 }, { "epoch": 1.665677546983185, "grad_norm": 0.310777229938151, "learning_rate": 2.4716013191645295e-05, "loss": 0.4384, "step": 1684 }, { "epoch": 1.6666666666666665, "grad_norm": 0.3443093523477439, "learning_rate": 2.469769146207402e-05, "loss": 0.4107, "step": 1685 }, { "epoch": 1.6676557863501484, "grad_norm": 0.27732248345294597, "learning_rate": 2.467936973250275e-05, "loss": 0.4154, "step": 1686 }, { "epoch": 1.66864490603363, "grad_norm": 0.3394675785807699, "learning_rate": 2.4661048002931477e-05, "loss": 0.5017, "step": 1687 }, { "epoch": 1.6696340257171118, "grad_norm": 0.33530005890339165, "learning_rate": 2.4642726273360207e-05, "loss": 0.4788, "step": 1688 }, { "epoch": 1.6706231454005933, "grad_norm": 0.3242349395154102, "learning_rate": 2.4624404543788933e-05, "loss": 0.4241, "step": 1689 }, { "epoch": 1.6716122650840752, "grad_norm": 0.30760783104340705, "learning_rate": 2.4606082814217663e-05, "loss": 0.4403, "step": 1690 }, { "epoch": 1.6726013847675567, "grad_norm": 0.33533199396862406, "learning_rate": 2.458776108464639e-05, "loss": 0.4568, "step": 1691 }, { "epoch": 1.6735905044510386, "grad_norm": 0.26868937017240524, "learning_rate": 2.456943935507512e-05, "loss": 0.4603, "step": 1692 }, { "epoch": 1.6745796241345203, "grad_norm": 0.27887094020722614, "learning_rate": 2.455111762550385e-05, "loss": 0.3977, "step": 1693 }, { "epoch": 1.675568743818002, "grad_norm": 0.2952611325359872, "learning_rate": 2.453279589593258e-05, "loss": 0.4352, "step": 1694 }, { "epoch": 1.6765578635014837, "grad_norm": 0.26972392989431493, "learning_rate": 2.4514474166361305e-05, "loss": 0.4404, "step": 1695 }, { "epoch": 1.6775469831849654, "grad_norm": 0.26276099105184353, "learning_rate": 2.4496152436790035e-05, "loss": 0.4099, "step": 1696 }, { "epoch": 1.6785361028684471, "grad_norm": 0.2834003933123841, "learning_rate": 2.4477830707218762e-05, "loss": 0.4044, "step": 1697 }, { "epoch": 1.6795252225519288, "grad_norm": 0.28788506496878463, "learning_rate": 2.445950897764749e-05, "loss": 0.3827, "step": 1698 }, { "epoch": 1.6805143422354105, "grad_norm": 0.31665468599783186, "learning_rate": 2.4441187248076218e-05, "loss": 0.4364, "step": 1699 }, { "epoch": 1.6815034619188922, "grad_norm": 0.2578234089735481, "learning_rate": 2.4422865518504948e-05, "loss": 0.3777, "step": 1700 }, { "epoch": 1.682492581602374, "grad_norm": 0.2979943738077502, "learning_rate": 2.4404543788933678e-05, "loss": 0.3914, "step": 1701 }, { "epoch": 1.6834817012858556, "grad_norm": 3.020828917743508, "learning_rate": 2.4386222059362404e-05, "loss": 0.3896, "step": 1702 }, { "epoch": 1.6844708209693373, "grad_norm": 0.3359718058064896, "learning_rate": 2.4367900329791134e-05, "loss": 0.4185, "step": 1703 }, { "epoch": 1.685459940652819, "grad_norm": 0.2994177996182077, "learning_rate": 2.434957860021986e-05, "loss": 0.4086, "step": 1704 }, { "epoch": 1.6864490603363007, "grad_norm": 0.2991038976312438, "learning_rate": 2.433125687064859e-05, "loss": 0.435, "step": 1705 }, { "epoch": 1.6874381800197824, "grad_norm": 0.28661212810149267, "learning_rate": 2.431293514107732e-05, "loss": 0.4143, "step": 1706 }, { "epoch": 1.688427299703264, "grad_norm": 0.37509122239648596, "learning_rate": 2.429461341150605e-05, "loss": 0.4484, "step": 1707 }, { "epoch": 1.6894164193867458, "grad_norm": 0.27836601910888614, "learning_rate": 2.4276291681934776e-05, "loss": 0.4142, "step": 1708 }, { "epoch": 1.6904055390702275, "grad_norm": 0.2782146839258225, "learning_rate": 2.4257969952363506e-05, "loss": 0.4015, "step": 1709 }, { "epoch": 1.6913946587537092, "grad_norm": 0.313916326384413, "learning_rate": 2.4239648222792232e-05, "loss": 0.4149, "step": 1710 }, { "epoch": 1.692383778437191, "grad_norm": 0.29787858936299944, "learning_rate": 2.4221326493220962e-05, "loss": 0.4177, "step": 1711 }, { "epoch": 1.6933728981206726, "grad_norm": 0.2902230168937217, "learning_rate": 2.420300476364969e-05, "loss": 0.4459, "step": 1712 }, { "epoch": 1.6943620178041543, "grad_norm": 0.2764799202205296, "learning_rate": 2.4184683034078418e-05, "loss": 0.4803, "step": 1713 }, { "epoch": 1.695351137487636, "grad_norm": 0.3045330992531217, "learning_rate": 2.4166361304507145e-05, "loss": 0.4842, "step": 1714 }, { "epoch": 1.6963402571711177, "grad_norm": 0.3029379637137786, "learning_rate": 2.4148039574935874e-05, "loss": 0.4554, "step": 1715 }, { "epoch": 1.6973293768545994, "grad_norm": 0.28227347456024454, "learning_rate": 2.41297178453646e-05, "loss": 0.4703, "step": 1716 }, { "epoch": 1.698318496538081, "grad_norm": 0.2943760092371024, "learning_rate": 2.4111396115793334e-05, "loss": 0.4384, "step": 1717 }, { "epoch": 1.6993076162215628, "grad_norm": 0.2985609138611823, "learning_rate": 2.409307438622206e-05, "loss": 0.4287, "step": 1718 }, { "epoch": 1.7002967359050445, "grad_norm": 0.26327465676428885, "learning_rate": 2.407475265665079e-05, "loss": 0.4177, "step": 1719 }, { "epoch": 1.7012858555885262, "grad_norm": 0.3178230796077746, "learning_rate": 2.4056430927079517e-05, "loss": 0.4423, "step": 1720 }, { "epoch": 1.7022749752720079, "grad_norm": 0.2814544147392274, "learning_rate": 2.4038109197508247e-05, "loss": 0.4657, "step": 1721 }, { "epoch": 1.7032640949554896, "grad_norm": 0.3002239090911129, "learning_rate": 2.4019787467936973e-05, "loss": 0.4673, "step": 1722 }, { "epoch": 1.7042532146389713, "grad_norm": 0.31781217494252273, "learning_rate": 2.4001465738365703e-05, "loss": 0.527, "step": 1723 }, { "epoch": 1.705242334322453, "grad_norm": 0.3498801258975987, "learning_rate": 2.3983144008794433e-05, "loss": 0.4487, "step": 1724 }, { "epoch": 1.7062314540059347, "grad_norm": 0.9087648133583388, "learning_rate": 2.396482227922316e-05, "loss": 0.3576, "step": 1725 }, { "epoch": 1.7072205736894164, "grad_norm": 0.3719846637041946, "learning_rate": 2.394650054965189e-05, "loss": 0.4582, "step": 1726 }, { "epoch": 1.708209693372898, "grad_norm": 0.35660510630355013, "learning_rate": 2.3928178820080615e-05, "loss": 0.4612, "step": 1727 }, { "epoch": 1.7091988130563798, "grad_norm": 0.33981496021373514, "learning_rate": 2.3909857090509345e-05, "loss": 0.4273, "step": 1728 }, { "epoch": 1.7101879327398615, "grad_norm": 0.3742138810240101, "learning_rate": 2.3891535360938075e-05, "loss": 0.4248, "step": 1729 }, { "epoch": 1.7111770524233432, "grad_norm": 0.36187927647193935, "learning_rate": 2.3873213631366805e-05, "loss": 0.4379, "step": 1730 }, { "epoch": 1.712166172106825, "grad_norm": 0.3017112790609432, "learning_rate": 2.385489190179553e-05, "loss": 0.4641, "step": 1731 }, { "epoch": 1.7131552917903066, "grad_norm": 0.3798548828676941, "learning_rate": 2.383657017222426e-05, "loss": 0.4185, "step": 1732 }, { "epoch": 1.7141444114737885, "grad_norm": 0.289413116025503, "learning_rate": 2.3818248442652987e-05, "loss": 0.3906, "step": 1733 }, { "epoch": 1.71513353115727, "grad_norm": 0.32233281303153377, "learning_rate": 2.3799926713081717e-05, "loss": 0.4263, "step": 1734 }, { "epoch": 1.7161226508407519, "grad_norm": 0.3006262895335519, "learning_rate": 2.3781604983510443e-05, "loss": 0.4834, "step": 1735 }, { "epoch": 1.7171117705242334, "grad_norm": 0.2618376951089462, "learning_rate": 2.3763283253939173e-05, "loss": 0.3999, "step": 1736 }, { "epoch": 1.7181008902077153, "grad_norm": 0.29424619057293433, "learning_rate": 2.37449615243679e-05, "loss": 0.4073, "step": 1737 }, { "epoch": 1.7190900098911968, "grad_norm": 0.3246152750212638, "learning_rate": 2.372663979479663e-05, "loss": 0.4299, "step": 1738 }, { "epoch": 1.7200791295746787, "grad_norm": 0.31160852845216713, "learning_rate": 2.3708318065225356e-05, "loss": 0.3979, "step": 1739 }, { "epoch": 1.7210682492581602, "grad_norm": 0.2755114326711097, "learning_rate": 2.3689996335654086e-05, "loss": 0.4641, "step": 1740 }, { "epoch": 1.722057368941642, "grad_norm": 0.30346825870958033, "learning_rate": 2.3671674606082816e-05, "loss": 0.4408, "step": 1741 }, { "epoch": 1.7230464886251236, "grad_norm": 0.30129194966583844, "learning_rate": 2.3653352876511545e-05, "loss": 0.3837, "step": 1742 }, { "epoch": 1.7240356083086055, "grad_norm": 0.2775790052611303, "learning_rate": 2.3635031146940272e-05, "loss": 0.4221, "step": 1743 }, { "epoch": 1.725024727992087, "grad_norm": 0.265693373028082, "learning_rate": 2.3616709417369e-05, "loss": 0.4168, "step": 1744 }, { "epoch": 1.7260138476755689, "grad_norm": 0.3022427971745998, "learning_rate": 2.3598387687797728e-05, "loss": 0.4021, "step": 1745 }, { "epoch": 1.7270029673590503, "grad_norm": 0.308247068877542, "learning_rate": 2.3580065958226458e-05, "loss": 0.4473, "step": 1746 }, { "epoch": 1.7279920870425323, "grad_norm": 0.29663390988995886, "learning_rate": 2.3561744228655188e-05, "loss": 0.4039, "step": 1747 }, { "epoch": 1.7289812067260137, "grad_norm": 0.2736150704186472, "learning_rate": 2.3543422499083914e-05, "loss": 0.4025, "step": 1748 }, { "epoch": 1.7299703264094957, "grad_norm": 2.823911700520109, "learning_rate": 2.3525100769512644e-05, "loss": 0.475, "step": 1749 }, { "epoch": 1.7309594460929771, "grad_norm": 0.34298388253711626, "learning_rate": 2.350677903994137e-05, "loss": 0.4595, "step": 1750 }, { "epoch": 1.731948565776459, "grad_norm": 0.30530109475341805, "learning_rate": 2.34884573103701e-05, "loss": 0.4368, "step": 1751 }, { "epoch": 1.7329376854599405, "grad_norm": 0.32890647032684833, "learning_rate": 2.347013558079883e-05, "loss": 0.4635, "step": 1752 }, { "epoch": 1.7339268051434225, "grad_norm": 0.2888669372975163, "learning_rate": 2.345181385122756e-05, "loss": 0.3912, "step": 1753 }, { "epoch": 1.734915924826904, "grad_norm": 0.29633911242582683, "learning_rate": 2.3433492121656286e-05, "loss": 0.3674, "step": 1754 }, { "epoch": 1.7359050445103859, "grad_norm": 0.26951749995769536, "learning_rate": 2.3415170392085016e-05, "loss": 0.4199, "step": 1755 }, { "epoch": 1.7368941641938673, "grad_norm": 0.3749803881291908, "learning_rate": 2.3396848662513742e-05, "loss": 0.483, "step": 1756 }, { "epoch": 1.7378832838773492, "grad_norm": 0.32853671133944534, "learning_rate": 2.3378526932942472e-05, "loss": 0.4501, "step": 1757 }, { "epoch": 1.7388724035608307, "grad_norm": 0.3443696483010131, "learning_rate": 2.33602052033712e-05, "loss": 0.499, "step": 1758 }, { "epoch": 1.7398615232443126, "grad_norm": 0.3145778345363139, "learning_rate": 2.334188347379993e-05, "loss": 0.3949, "step": 1759 }, { "epoch": 1.7408506429277941, "grad_norm": 0.319931054528546, "learning_rate": 2.3323561744228655e-05, "loss": 0.3834, "step": 1760 }, { "epoch": 1.741839762611276, "grad_norm": 0.2862423758324553, "learning_rate": 2.3305240014657385e-05, "loss": 0.4465, "step": 1761 }, { "epoch": 1.7428288822947575, "grad_norm": 0.30485977879966464, "learning_rate": 2.328691828508611e-05, "loss": 0.401, "step": 1762 }, { "epoch": 1.7438180019782394, "grad_norm": 0.3165930256651749, "learning_rate": 2.326859655551484e-05, "loss": 0.4678, "step": 1763 }, { "epoch": 1.744807121661721, "grad_norm": 0.29246729099227736, "learning_rate": 2.325027482594357e-05, "loss": 0.3993, "step": 1764 }, { "epoch": 1.7457962413452028, "grad_norm": 0.31561117841951253, "learning_rate": 2.32319530963723e-05, "loss": 0.4925, "step": 1765 }, { "epoch": 1.7467853610286843, "grad_norm": 0.2888481565602988, "learning_rate": 2.3213631366801027e-05, "loss": 0.4381, "step": 1766 }, { "epoch": 1.7477744807121662, "grad_norm": 0.2868705156659709, "learning_rate": 2.3195309637229757e-05, "loss": 0.3683, "step": 1767 }, { "epoch": 1.7487636003956477, "grad_norm": 0.3782601564035021, "learning_rate": 2.3176987907658483e-05, "loss": 0.4532, "step": 1768 }, { "epoch": 1.7497527200791296, "grad_norm": 0.2704871280008665, "learning_rate": 2.3158666178087213e-05, "loss": 0.4201, "step": 1769 }, { "epoch": 1.7507418397626113, "grad_norm": 0.3875138252219907, "learning_rate": 2.3140344448515943e-05, "loss": 0.4805, "step": 1770 }, { "epoch": 1.751730959446093, "grad_norm": 0.5492745343372892, "learning_rate": 2.312202271894467e-05, "loss": 0.4156, "step": 1771 }, { "epoch": 1.7527200791295747, "grad_norm": 0.3204756187656851, "learning_rate": 2.31037009893734e-05, "loss": 0.4988, "step": 1772 }, { "epoch": 1.7537091988130564, "grad_norm": 0.35278429265769184, "learning_rate": 2.3085379259802125e-05, "loss": 0.4668, "step": 1773 }, { "epoch": 1.7546983184965381, "grad_norm": 0.2818358328142832, "learning_rate": 2.3067057530230855e-05, "loss": 0.4493, "step": 1774 }, { "epoch": 1.7556874381800198, "grad_norm": 0.3770205606309304, "learning_rate": 2.304873580065958e-05, "loss": 0.4706, "step": 1775 }, { "epoch": 1.7566765578635015, "grad_norm": 0.2887530974485894, "learning_rate": 2.3030414071088315e-05, "loss": 0.4208, "step": 1776 }, { "epoch": 1.7576656775469832, "grad_norm": 0.38165004798947394, "learning_rate": 2.301209234151704e-05, "loss": 0.4299, "step": 1777 }, { "epoch": 1.758654797230465, "grad_norm": 0.30821868136973407, "learning_rate": 2.299377061194577e-05, "loss": 0.4569, "step": 1778 }, { "epoch": 1.7596439169139466, "grad_norm": 0.24167543739660413, "learning_rate": 2.2975448882374497e-05, "loss": 0.3777, "step": 1779 }, { "epoch": 1.7606330365974283, "grad_norm": 0.40409075639658476, "learning_rate": 2.2957127152803227e-05, "loss": 0.4594, "step": 1780 }, { "epoch": 1.76162215628091, "grad_norm": 0.3254914242691898, "learning_rate": 2.2938805423231954e-05, "loss": 0.4206, "step": 1781 }, { "epoch": 1.7626112759643917, "grad_norm": 2.509631711790463, "learning_rate": 2.2920483693660683e-05, "loss": 0.4511, "step": 1782 }, { "epoch": 1.7636003956478734, "grad_norm": 0.3808689413851784, "learning_rate": 2.290216196408941e-05, "loss": 0.4229, "step": 1783 }, { "epoch": 1.764589515331355, "grad_norm": 0.318175279111379, "learning_rate": 2.288384023451814e-05, "loss": 0.4416, "step": 1784 }, { "epoch": 1.7655786350148368, "grad_norm": 0.26964885093490887, "learning_rate": 2.2865518504946866e-05, "loss": 0.4192, "step": 1785 }, { "epoch": 1.7665677546983185, "grad_norm": 0.2632332792155176, "learning_rate": 2.2847196775375596e-05, "loss": 0.4172, "step": 1786 }, { "epoch": 1.7675568743818002, "grad_norm": 0.4047127793846293, "learning_rate": 2.2828875045804322e-05, "loss": 0.4712, "step": 1787 }, { "epoch": 1.768545994065282, "grad_norm": 0.27723739287491106, "learning_rate": 2.2810553316233055e-05, "loss": 0.4164, "step": 1788 }, { "epoch": 1.7695351137487636, "grad_norm": 0.3030859442151421, "learning_rate": 2.2792231586661782e-05, "loss": 0.3961, "step": 1789 }, { "epoch": 1.7705242334322453, "grad_norm": 0.8577754720604038, "learning_rate": 2.277390985709051e-05, "loss": 0.504, "step": 1790 }, { "epoch": 1.771513353115727, "grad_norm": 0.3242162051396364, "learning_rate": 2.2755588127519238e-05, "loss": 0.4342, "step": 1791 }, { "epoch": 1.7725024727992087, "grad_norm": 0.3408702815957067, "learning_rate": 2.2737266397947968e-05, "loss": 0.5057, "step": 1792 }, { "epoch": 1.7734915924826904, "grad_norm": 0.3542183072173877, "learning_rate": 2.2718944668376698e-05, "loss": 0.4232, "step": 1793 }, { "epoch": 1.774480712166172, "grad_norm": 0.4235624797543405, "learning_rate": 2.2700622938805424e-05, "loss": 0.437, "step": 1794 }, { "epoch": 1.7754698318496538, "grad_norm": 0.2789629789323769, "learning_rate": 2.2682301209234154e-05, "loss": 0.4422, "step": 1795 }, { "epoch": 1.7764589515331355, "grad_norm": 0.31066306585084447, "learning_rate": 2.266397947966288e-05, "loss": 0.4424, "step": 1796 }, { "epoch": 1.7774480712166172, "grad_norm": 0.33933834935357965, "learning_rate": 2.264565775009161e-05, "loss": 0.4203, "step": 1797 }, { "epoch": 1.7784371909000989, "grad_norm": 0.32712238177562997, "learning_rate": 2.2627336020520337e-05, "loss": 0.4319, "step": 1798 }, { "epoch": 1.7794263105835806, "grad_norm": 0.3040834782849549, "learning_rate": 2.2609014290949066e-05, "loss": 0.4453, "step": 1799 }, { "epoch": 1.7804154302670623, "grad_norm": 0.30022108803178027, "learning_rate": 2.2590692561377796e-05, "loss": 0.4445, "step": 1800 }, { "epoch": 1.781404549950544, "grad_norm": 0.3088237766977971, "learning_rate": 2.2572370831806526e-05, "loss": 0.425, "step": 1801 }, { "epoch": 1.7823936696340257, "grad_norm": 0.25065884893036056, "learning_rate": 2.2554049102235252e-05, "loss": 0.4144, "step": 1802 }, { "epoch": 1.7833827893175074, "grad_norm": 0.2975123258434565, "learning_rate": 2.2535727372663982e-05, "loss": 0.4606, "step": 1803 }, { "epoch": 1.784371909000989, "grad_norm": 3.0916522188183118, "learning_rate": 2.251740564309271e-05, "loss": 0.5796, "step": 1804 }, { "epoch": 1.7853610286844708, "grad_norm": 0.4161541547275752, "learning_rate": 2.249908391352144e-05, "loss": 0.4224, "step": 1805 }, { "epoch": 1.7863501483679525, "grad_norm": 0.4190616672823314, "learning_rate": 2.2480762183950165e-05, "loss": 0.4956, "step": 1806 }, { "epoch": 1.7873392680514342, "grad_norm": 0.7468045943261583, "learning_rate": 2.2462440454378895e-05, "loss": 0.506, "step": 1807 }, { "epoch": 1.7883283877349159, "grad_norm": 0.39878637991681054, "learning_rate": 2.244411872480762e-05, "loss": 0.4582, "step": 1808 }, { "epoch": 1.7893175074183976, "grad_norm": 0.3939410722134704, "learning_rate": 2.242579699523635e-05, "loss": 0.4213, "step": 1809 }, { "epoch": 1.7903066271018795, "grad_norm": 0.35893540238509397, "learning_rate": 2.2407475265665077e-05, "loss": 0.4452, "step": 1810 }, { "epoch": 1.791295746785361, "grad_norm": 0.3179290020697894, "learning_rate": 2.2389153536093807e-05, "loss": 0.4307, "step": 1811 }, { "epoch": 1.7922848664688429, "grad_norm": 0.37222861438373844, "learning_rate": 2.2370831806522537e-05, "loss": 0.4481, "step": 1812 }, { "epoch": 1.7932739861523244, "grad_norm": 0.4758358116814864, "learning_rate": 2.2352510076951267e-05, "loss": 0.4565, "step": 1813 }, { "epoch": 1.7942631058358063, "grad_norm": 0.30087252373022794, "learning_rate": 2.2334188347379993e-05, "loss": 0.4506, "step": 1814 }, { "epoch": 1.7952522255192878, "grad_norm": 0.38966070557634785, "learning_rate": 2.2315866617808723e-05, "loss": 0.4129, "step": 1815 }, { "epoch": 1.7962413452027697, "grad_norm": 0.4977116302980233, "learning_rate": 2.229754488823745e-05, "loss": 0.4218, "step": 1816 }, { "epoch": 1.7972304648862512, "grad_norm": 0.3265049192175431, "learning_rate": 2.227922315866618e-05, "loss": 0.418, "step": 1817 }, { "epoch": 1.798219584569733, "grad_norm": 0.3832944235691656, "learning_rate": 2.226090142909491e-05, "loss": 0.4684, "step": 1818 }, { "epoch": 1.7992087042532146, "grad_norm": 0.2958689650940704, "learning_rate": 2.2242579699523635e-05, "loss": 0.4074, "step": 1819 }, { "epoch": 1.8001978239366965, "grad_norm": 0.37038788365942676, "learning_rate": 2.2224257969952365e-05, "loss": 0.4259, "step": 1820 }, { "epoch": 1.801186943620178, "grad_norm": 0.3553791418334394, "learning_rate": 2.220593624038109e-05, "loss": 0.4669, "step": 1821 }, { "epoch": 1.8021760633036599, "grad_norm": 0.29718199661262223, "learning_rate": 2.218761451080982e-05, "loss": 0.4221, "step": 1822 }, { "epoch": 1.8031651829871413, "grad_norm": 0.3359429382037479, "learning_rate": 2.2169292781238548e-05, "loss": 0.397, "step": 1823 }, { "epoch": 1.8041543026706233, "grad_norm": 0.3048041433497428, "learning_rate": 2.215097105166728e-05, "loss": 0.4191, "step": 1824 }, { "epoch": 1.8051434223541047, "grad_norm": 3.3517379572451342, "learning_rate": 2.2132649322096007e-05, "loss": 0.4819, "step": 1825 }, { "epoch": 1.8061325420375867, "grad_norm": 0.2867123953220445, "learning_rate": 2.2114327592524737e-05, "loss": 0.4172, "step": 1826 }, { "epoch": 1.8071216617210681, "grad_norm": 0.2871478819593749, "learning_rate": 2.2096005862953464e-05, "loss": 0.3971, "step": 1827 }, { "epoch": 1.80811078140455, "grad_norm": 0.3245949025237984, "learning_rate": 2.2077684133382193e-05, "loss": 0.4968, "step": 1828 }, { "epoch": 1.8090999010880315, "grad_norm": 25.78091323351991, "learning_rate": 2.205936240381092e-05, "loss": 0.693, "step": 1829 }, { "epoch": 1.8100890207715135, "grad_norm": 0.49724123565643946, "learning_rate": 2.204104067423965e-05, "loss": 0.4492, "step": 1830 }, { "epoch": 1.811078140454995, "grad_norm": 0.3303838283801352, "learning_rate": 2.2022718944668376e-05, "loss": 0.4216, "step": 1831 }, { "epoch": 1.8120672601384769, "grad_norm": 0.3510098031005022, "learning_rate": 2.2004397215097106e-05, "loss": 0.4758, "step": 1832 }, { "epoch": 1.8130563798219583, "grad_norm": 0.2829388974092525, "learning_rate": 2.1986075485525832e-05, "loss": 0.4332, "step": 1833 }, { "epoch": 1.8140454995054403, "grad_norm": 0.29997263346647157, "learning_rate": 2.1967753755954562e-05, "loss": 0.4289, "step": 1834 }, { "epoch": 1.8150346191889217, "grad_norm": 0.31669510053999383, "learning_rate": 2.1949432026383292e-05, "loss": 0.4411, "step": 1835 }, { "epoch": 1.8160237388724036, "grad_norm": 0.29684511655355245, "learning_rate": 2.1931110296812022e-05, "loss": 0.407, "step": 1836 }, { "epoch": 1.8170128585558851, "grad_norm": 0.6958017772881732, "learning_rate": 2.1912788567240748e-05, "loss": 0.4468, "step": 1837 }, { "epoch": 1.818001978239367, "grad_norm": 0.336206373063671, "learning_rate": 2.1894466837669478e-05, "loss": 0.4296, "step": 1838 }, { "epoch": 1.8189910979228485, "grad_norm": 0.38504542640419415, "learning_rate": 2.1876145108098204e-05, "loss": 0.4059, "step": 1839 }, { "epoch": 1.8199802176063304, "grad_norm": 0.32789250753473015, "learning_rate": 2.1857823378526934e-05, "loss": 0.4773, "step": 1840 }, { "epoch": 1.820969337289812, "grad_norm": 0.3200012365592021, "learning_rate": 2.1839501648955664e-05, "loss": 0.3923, "step": 1841 }, { "epoch": 1.8219584569732938, "grad_norm": 0.45827677015168833, "learning_rate": 2.182117991938439e-05, "loss": 0.4699, "step": 1842 }, { "epoch": 1.8229475766567753, "grad_norm": 0.30103049669909526, "learning_rate": 2.180285818981312e-05, "loss": 0.4359, "step": 1843 }, { "epoch": 1.8239366963402572, "grad_norm": 0.297495772208957, "learning_rate": 2.1784536460241847e-05, "loss": 0.4252, "step": 1844 }, { "epoch": 1.8249258160237387, "grad_norm": 0.3680654701793383, "learning_rate": 2.1766214730670576e-05, "loss": 0.458, "step": 1845 }, { "epoch": 1.8259149357072206, "grad_norm": 0.3403708954708426, "learning_rate": 2.1747893001099303e-05, "loss": 0.444, "step": 1846 }, { "epoch": 1.826904055390702, "grad_norm": 0.2803891604745382, "learning_rate": 2.1729571271528033e-05, "loss": 0.4229, "step": 1847 }, { "epoch": 1.827893175074184, "grad_norm": 0.2997125783852745, "learning_rate": 2.1711249541956762e-05, "loss": 0.4057, "step": 1848 }, { "epoch": 1.8288822947576657, "grad_norm": 0.3371508195438753, "learning_rate": 2.1692927812385492e-05, "loss": 0.4581, "step": 1849 }, { "epoch": 1.8298714144411474, "grad_norm": 0.2822053601448072, "learning_rate": 2.167460608281422e-05, "loss": 0.4262, "step": 1850 }, { "epoch": 1.8308605341246291, "grad_norm": 0.33119023555410904, "learning_rate": 2.165628435324295e-05, "loss": 0.4673, "step": 1851 }, { "epoch": 1.8318496538081108, "grad_norm": 0.2982548956831065, "learning_rate": 2.1637962623671675e-05, "loss": 0.4265, "step": 1852 }, { "epoch": 1.8328387734915925, "grad_norm": 0.2861624181648438, "learning_rate": 2.1619640894100405e-05, "loss": 0.4637, "step": 1853 }, { "epoch": 1.8338278931750742, "grad_norm": 0.27624749764651596, "learning_rate": 2.160131916452913e-05, "loss": 0.378, "step": 1854 }, { "epoch": 1.834817012858556, "grad_norm": 0.30172044665288217, "learning_rate": 2.158299743495786e-05, "loss": 0.442, "step": 1855 }, { "epoch": 1.8358061325420376, "grad_norm": 0.8175953310300249, "learning_rate": 2.1564675705386587e-05, "loss": 0.5055, "step": 1856 }, { "epoch": 1.8367952522255193, "grad_norm": 0.30746248179532876, "learning_rate": 2.1546353975815317e-05, "loss": 0.45, "step": 1857 }, { "epoch": 1.837784371909001, "grad_norm": 0.3408014710812103, "learning_rate": 2.1528032246244047e-05, "loss": 0.4448, "step": 1858 }, { "epoch": 1.8387734915924827, "grad_norm": 0.6207727087629815, "learning_rate": 2.1509710516672773e-05, "loss": 0.4727, "step": 1859 }, { "epoch": 1.8397626112759644, "grad_norm": 0.4225278421377986, "learning_rate": 2.1491388787101503e-05, "loss": 0.4452, "step": 1860 }, { "epoch": 1.840751730959446, "grad_norm": 0.4685555138093697, "learning_rate": 2.1473067057530233e-05, "loss": 0.4202, "step": 1861 }, { "epoch": 1.8417408506429278, "grad_norm": 0.2891269631180874, "learning_rate": 2.145474532795896e-05, "loss": 0.4393, "step": 1862 }, { "epoch": 1.8427299703264095, "grad_norm": 0.37223238502196465, "learning_rate": 2.143642359838769e-05, "loss": 0.3935, "step": 1863 }, { "epoch": 1.8437190900098912, "grad_norm": 0.3478084518953676, "learning_rate": 2.141810186881642e-05, "loss": 0.4112, "step": 1864 }, { "epoch": 1.844708209693373, "grad_norm": 0.2905645567287115, "learning_rate": 2.1399780139245145e-05, "loss": 0.4286, "step": 1865 }, { "epoch": 1.8456973293768546, "grad_norm": 0.37515837566952076, "learning_rate": 2.1381458409673875e-05, "loss": 0.5235, "step": 1866 }, { "epoch": 1.8466864490603363, "grad_norm": 0.2951545143923494, "learning_rate": 2.13631366801026e-05, "loss": 0.5067, "step": 1867 }, { "epoch": 1.847675568743818, "grad_norm": 0.2948346426391745, "learning_rate": 2.134481495053133e-05, "loss": 0.3966, "step": 1868 }, { "epoch": 1.8486646884272997, "grad_norm": 0.41530734276356635, "learning_rate": 2.1326493220960058e-05, "loss": 0.4632, "step": 1869 }, { "epoch": 1.8496538081107814, "grad_norm": 0.3147529275156093, "learning_rate": 2.1308171491388788e-05, "loss": 0.4547, "step": 1870 }, { "epoch": 1.850642927794263, "grad_norm": 0.2927562274139951, "learning_rate": 2.1289849761817514e-05, "loss": 0.3962, "step": 1871 }, { "epoch": 1.8516320474777448, "grad_norm": 0.3657012517786323, "learning_rate": 2.1271528032246247e-05, "loss": 0.4178, "step": 1872 }, { "epoch": 1.8526211671612265, "grad_norm": 0.2740740734494427, "learning_rate": 2.1253206302674974e-05, "loss": 0.4467, "step": 1873 }, { "epoch": 1.8536102868447082, "grad_norm": 0.3075673256037017, "learning_rate": 2.1234884573103704e-05, "loss": 0.4186, "step": 1874 }, { "epoch": 1.8545994065281899, "grad_norm": 0.2920268194596183, "learning_rate": 2.121656284353243e-05, "loss": 0.3896, "step": 1875 }, { "epoch": 1.8555885262116716, "grad_norm": 0.27201039572059627, "learning_rate": 2.119824111396116e-05, "loss": 0.4085, "step": 1876 }, { "epoch": 1.8565776458951533, "grad_norm": 0.353148381804933, "learning_rate": 2.1179919384389886e-05, "loss": 0.4426, "step": 1877 }, { "epoch": 1.857566765578635, "grad_norm": 8.40808607244907, "learning_rate": 2.1161597654818616e-05, "loss": 0.4798, "step": 1878 }, { "epoch": 1.8585558852621167, "grad_norm": 0.267897511367695, "learning_rate": 2.1143275925247342e-05, "loss": 0.4339, "step": 1879 }, { "epoch": 1.8595450049455984, "grad_norm": 0.31654957685044094, "learning_rate": 2.1124954195676072e-05, "loss": 0.4733, "step": 1880 }, { "epoch": 1.86053412462908, "grad_norm": 0.3027075365016889, "learning_rate": 2.1106632466104802e-05, "loss": 0.4363, "step": 1881 }, { "epoch": 1.8615232443125618, "grad_norm": 0.31790990073853925, "learning_rate": 2.108831073653353e-05, "loss": 0.4105, "step": 1882 }, { "epoch": 1.8625123639960435, "grad_norm": 0.2843270321617449, "learning_rate": 2.1069989006962258e-05, "loss": 0.4514, "step": 1883 }, { "epoch": 1.8635014836795252, "grad_norm": 0.3044166271116939, "learning_rate": 2.1051667277390988e-05, "loss": 0.4842, "step": 1884 }, { "epoch": 1.8644906033630069, "grad_norm": 0.3358497677903399, "learning_rate": 2.1033345547819714e-05, "loss": 0.4455, "step": 1885 }, { "epoch": 1.8654797230464886, "grad_norm": 0.30731251689927197, "learning_rate": 2.1015023818248444e-05, "loss": 0.4451, "step": 1886 }, { "epoch": 1.8664688427299705, "grad_norm": 0.3143692937530264, "learning_rate": 2.0996702088677174e-05, "loss": 0.4269, "step": 1887 }, { "epoch": 1.867457962413452, "grad_norm": 0.2834940816390246, "learning_rate": 2.09783803591059e-05, "loss": 0.412, "step": 1888 }, { "epoch": 1.8684470820969339, "grad_norm": 0.2788270677440903, "learning_rate": 2.096005862953463e-05, "loss": 0.3973, "step": 1889 }, { "epoch": 1.8694362017804154, "grad_norm": 0.3078569868447692, "learning_rate": 2.0941736899963357e-05, "loss": 0.4802, "step": 1890 }, { "epoch": 1.8704253214638973, "grad_norm": 0.2882271462085658, "learning_rate": 2.0923415170392087e-05, "loss": 0.4506, "step": 1891 }, { "epoch": 1.8714144411473788, "grad_norm": 0.2768064460294965, "learning_rate": 2.0905093440820813e-05, "loss": 0.4525, "step": 1892 }, { "epoch": 1.8724035608308607, "grad_norm": 0.30177959635692414, "learning_rate": 2.0886771711249543e-05, "loss": 0.4446, "step": 1893 }, { "epoch": 1.8733926805143422, "grad_norm": 0.30727441186455273, "learning_rate": 2.086844998167827e-05, "loss": 0.4138, "step": 1894 }, { "epoch": 1.874381800197824, "grad_norm": 0.3029843395912454, "learning_rate": 2.0850128252107002e-05, "loss": 0.467, "step": 1895 }, { "epoch": 1.8753709198813056, "grad_norm": 0.28610816219099716, "learning_rate": 2.083180652253573e-05, "loss": 0.4103, "step": 1896 }, { "epoch": 1.8763600395647875, "grad_norm": 0.26564180151467914, "learning_rate": 2.081348479296446e-05, "loss": 0.413, "step": 1897 }, { "epoch": 1.877349159248269, "grad_norm": 0.32891431091901296, "learning_rate": 2.0795163063393185e-05, "loss": 0.4131, "step": 1898 }, { "epoch": 1.8783382789317509, "grad_norm": 0.2655039622432033, "learning_rate": 2.0776841333821915e-05, "loss": 0.421, "step": 1899 }, { "epoch": 1.8793273986152323, "grad_norm": 0.3009633138938526, "learning_rate": 2.075851960425064e-05, "loss": 0.4188, "step": 1900 }, { "epoch": 1.8803165182987143, "grad_norm": 0.29124656986978736, "learning_rate": 2.074019787467937e-05, "loss": 0.4162, "step": 1901 }, { "epoch": 1.8813056379821957, "grad_norm": 0.30884019091654086, "learning_rate": 2.0721876145108097e-05, "loss": 0.4565, "step": 1902 }, { "epoch": 1.8822947576656777, "grad_norm": 0.30522532353130594, "learning_rate": 2.0703554415536827e-05, "loss": 0.5036, "step": 1903 }, { "epoch": 1.8832838773491591, "grad_norm": 0.3076309070429283, "learning_rate": 2.0685232685965557e-05, "loss": 0.4343, "step": 1904 }, { "epoch": 1.884272997032641, "grad_norm": 0.3451728446476686, "learning_rate": 2.0666910956394283e-05, "loss": 0.4339, "step": 1905 }, { "epoch": 1.8852621167161225, "grad_norm": 0.28959111272819593, "learning_rate": 2.0648589226823013e-05, "loss": 0.4249, "step": 1906 }, { "epoch": 1.8862512363996045, "grad_norm": 0.30354422997001196, "learning_rate": 2.0630267497251743e-05, "loss": 0.4156, "step": 1907 }, { "epoch": 1.887240356083086, "grad_norm": 0.3085053722428896, "learning_rate": 2.061194576768047e-05, "loss": 0.4465, "step": 1908 }, { "epoch": 1.8882294757665679, "grad_norm": 0.3437045799968937, "learning_rate": 2.05936240381092e-05, "loss": 0.4523, "step": 1909 }, { "epoch": 1.8892185954500493, "grad_norm": 0.28581058747896737, "learning_rate": 2.057530230853793e-05, "loss": 0.427, "step": 1910 }, { "epoch": 1.8902077151335313, "grad_norm": 0.271508864393249, "learning_rate": 2.0556980578966656e-05, "loss": 0.3641, "step": 1911 }, { "epoch": 1.8911968348170127, "grad_norm": 0.35199224588959227, "learning_rate": 2.0538658849395385e-05, "loss": 0.4209, "step": 1912 }, { "epoch": 1.8921859545004946, "grad_norm": 0.2953225794878225, "learning_rate": 2.0520337119824112e-05, "loss": 0.3718, "step": 1913 }, { "epoch": 1.8931750741839761, "grad_norm": 0.2738162952704808, "learning_rate": 2.050201539025284e-05, "loss": 0.3926, "step": 1914 }, { "epoch": 1.894164193867458, "grad_norm": 0.26534375180654185, "learning_rate": 2.0483693660681568e-05, "loss": 0.3923, "step": 1915 }, { "epoch": 1.8951533135509395, "grad_norm": 0.2631897845252683, "learning_rate": 2.0465371931110298e-05, "loss": 0.3955, "step": 1916 }, { "epoch": 1.8961424332344214, "grad_norm": 0.2942843582306155, "learning_rate": 2.0447050201539024e-05, "loss": 0.419, "step": 1917 }, { "epoch": 1.897131552917903, "grad_norm": 0.30867929917338793, "learning_rate": 2.0428728471967754e-05, "loss": 0.4338, "step": 1918 }, { "epoch": 1.8981206726013848, "grad_norm": 0.2990621787095512, "learning_rate": 2.0410406742396484e-05, "loss": 0.4383, "step": 1919 }, { "epoch": 1.8991097922848663, "grad_norm": 0.27810473804617325, "learning_rate": 2.0392085012825214e-05, "loss": 0.4491, "step": 1920 }, { "epoch": 1.9000989119683482, "grad_norm": 0.33382076803749, "learning_rate": 2.037376328325394e-05, "loss": 0.4381, "step": 1921 }, { "epoch": 1.9010880316518297, "grad_norm": 0.3304112930354128, "learning_rate": 2.035544155368267e-05, "loss": 0.5064, "step": 1922 }, { "epoch": 1.9020771513353116, "grad_norm": 0.25783866601539174, "learning_rate": 2.0337119824111396e-05, "loss": 0.3897, "step": 1923 }, { "epoch": 1.903066271018793, "grad_norm": 0.27328280868082716, "learning_rate": 2.0318798094540126e-05, "loss": 0.4383, "step": 1924 }, { "epoch": 1.904055390702275, "grad_norm": 0.33204613407624645, "learning_rate": 2.0300476364968852e-05, "loss": 0.5073, "step": 1925 }, { "epoch": 1.9050445103857567, "grad_norm": 0.2766857216599759, "learning_rate": 2.0282154635397582e-05, "loss": 0.4149, "step": 1926 }, { "epoch": 1.9060336300692384, "grad_norm": 0.266461589601163, "learning_rate": 2.0263832905826312e-05, "loss": 0.4314, "step": 1927 }, { "epoch": 1.9070227497527201, "grad_norm": 0.2512857349198211, "learning_rate": 2.024551117625504e-05, "loss": 0.4336, "step": 1928 }, { "epoch": 1.9080118694362018, "grad_norm": 0.33637530648765923, "learning_rate": 2.022718944668377e-05, "loss": 0.4225, "step": 1929 }, { "epoch": 1.9090009891196835, "grad_norm": 0.28208217764150173, "learning_rate": 2.0208867717112495e-05, "loss": 0.4725, "step": 1930 }, { "epoch": 1.9099901088031652, "grad_norm": 0.26065398592400135, "learning_rate": 2.0190545987541225e-05, "loss": 0.4567, "step": 1931 }, { "epoch": 1.910979228486647, "grad_norm": 0.28479843394920357, "learning_rate": 2.0172224257969954e-05, "loss": 0.4287, "step": 1932 }, { "epoch": 1.9119683481701286, "grad_norm": 3.294579825793158, "learning_rate": 2.0153902528398684e-05, "loss": 0.5882, "step": 1933 }, { "epoch": 1.9129574678536103, "grad_norm": 0.28936302581206536, "learning_rate": 2.013558079882741e-05, "loss": 0.5019, "step": 1934 }, { "epoch": 1.913946587537092, "grad_norm": 0.28047234749246863, "learning_rate": 2.011725906925614e-05, "loss": 0.4326, "step": 1935 }, { "epoch": 1.9149357072205737, "grad_norm": 0.3100419020654506, "learning_rate": 2.0098937339684867e-05, "loss": 0.4336, "step": 1936 }, { "epoch": 1.9159248269040554, "grad_norm": 0.30316063562527656, "learning_rate": 2.0080615610113597e-05, "loss": 0.4482, "step": 1937 }, { "epoch": 1.916913946587537, "grad_norm": 0.3203458184728088, "learning_rate": 2.0062293880542323e-05, "loss": 0.4761, "step": 1938 }, { "epoch": 1.9179030662710188, "grad_norm": 0.319115811593246, "learning_rate": 2.0043972150971053e-05, "loss": 0.4387, "step": 1939 }, { "epoch": 1.9188921859545005, "grad_norm": 0.29266033140236186, "learning_rate": 2.002565042139978e-05, "loss": 0.4128, "step": 1940 }, { "epoch": 1.9198813056379822, "grad_norm": 0.2934869021796227, "learning_rate": 2.000732869182851e-05, "loss": 0.4266, "step": 1941 }, { "epoch": 1.920870425321464, "grad_norm": 0.2954310231819758, "learning_rate": 1.9989006962257235e-05, "loss": 0.4862, "step": 1942 }, { "epoch": 1.9218595450049456, "grad_norm": 0.2989192690361351, "learning_rate": 1.997068523268597e-05, "loss": 0.4284, "step": 1943 }, { "epoch": 1.9228486646884273, "grad_norm": 0.32532736480103575, "learning_rate": 1.9952363503114695e-05, "loss": 0.5123, "step": 1944 }, { "epoch": 1.923837784371909, "grad_norm": 0.279809147331516, "learning_rate": 1.9934041773543425e-05, "loss": 0.4593, "step": 1945 }, { "epoch": 1.9248269040553907, "grad_norm": 0.3043919655682651, "learning_rate": 1.991572004397215e-05, "loss": 0.4417, "step": 1946 }, { "epoch": 1.9258160237388724, "grad_norm": 0.2795337887573538, "learning_rate": 1.989739831440088e-05, "loss": 0.4287, "step": 1947 }, { "epoch": 1.926805143422354, "grad_norm": 0.26009593666483405, "learning_rate": 1.9879076584829608e-05, "loss": 0.4479, "step": 1948 }, { "epoch": 1.9277942631058358, "grad_norm": 0.25306492114507956, "learning_rate": 1.9860754855258337e-05, "loss": 0.4074, "step": 1949 }, { "epoch": 1.9287833827893175, "grad_norm": 0.2706763584509426, "learning_rate": 1.9842433125687067e-05, "loss": 0.4687, "step": 1950 }, { "epoch": 1.9297725024727992, "grad_norm": 0.2546219655402372, "learning_rate": 1.9824111396115794e-05, "loss": 0.3809, "step": 1951 }, { "epoch": 1.9307616221562809, "grad_norm": 0.2917574146301412, "learning_rate": 1.9805789666544523e-05, "loss": 0.4002, "step": 1952 }, { "epoch": 1.9317507418397626, "grad_norm": 0.914811674910379, "learning_rate": 1.978746793697325e-05, "loss": 0.4656, "step": 1953 }, { "epoch": 1.9327398615232443, "grad_norm": 0.2783013811561288, "learning_rate": 1.976914620740198e-05, "loss": 0.4792, "step": 1954 }, { "epoch": 1.933728981206726, "grad_norm": 0.28602656481136324, "learning_rate": 1.975082447783071e-05, "loss": 0.4541, "step": 1955 }, { "epoch": 1.9347181008902077, "grad_norm": 0.2592620006394712, "learning_rate": 1.973250274825944e-05, "loss": 0.4348, "step": 1956 }, { "epoch": 1.9357072205736894, "grad_norm": 0.2773762643584451, "learning_rate": 1.9714181018688166e-05, "loss": 0.4063, "step": 1957 }, { "epoch": 1.936696340257171, "grad_norm": 0.28599821633612305, "learning_rate": 1.9695859289116895e-05, "loss": 0.4824, "step": 1958 }, { "epoch": 1.9376854599406528, "grad_norm": 0.26672204401496125, "learning_rate": 1.9677537559545622e-05, "loss": 0.4232, "step": 1959 }, { "epoch": 1.9386745796241345, "grad_norm": 0.2892625352799071, "learning_rate": 1.965921582997435e-05, "loss": 0.4796, "step": 1960 }, { "epoch": 1.9396636993076162, "grad_norm": 0.2817788199429448, "learning_rate": 1.9640894100403078e-05, "loss": 0.402, "step": 1961 }, { "epoch": 1.9406528189910979, "grad_norm": 0.2653710321950093, "learning_rate": 1.9622572370831808e-05, "loss": 0.4031, "step": 1962 }, { "epoch": 1.9416419386745796, "grad_norm": 0.313468529227229, "learning_rate": 1.9604250641260534e-05, "loss": 0.469, "step": 1963 }, { "epoch": 1.9426310583580615, "grad_norm": 0.30116180498202555, "learning_rate": 1.9585928911689264e-05, "loss": 0.4781, "step": 1964 }, { "epoch": 1.943620178041543, "grad_norm": 0.36835947832919974, "learning_rate": 1.956760718211799e-05, "loss": 0.3505, "step": 1965 }, { "epoch": 1.9446092977250249, "grad_norm": 0.2916022025393715, "learning_rate": 1.954928545254672e-05, "loss": 0.4331, "step": 1966 }, { "epoch": 1.9455984174085064, "grad_norm": 0.24576757883657926, "learning_rate": 1.953096372297545e-05, "loss": 0.3918, "step": 1967 }, { "epoch": 1.9465875370919883, "grad_norm": 0.30131909213793245, "learning_rate": 1.951264199340418e-05, "loss": 0.461, "step": 1968 }, { "epoch": 1.9475766567754698, "grad_norm": 0.3339260654475112, "learning_rate": 1.9494320263832906e-05, "loss": 0.436, "step": 1969 }, { "epoch": 1.9485657764589517, "grad_norm": 0.27524157139256206, "learning_rate": 1.9475998534261636e-05, "loss": 0.4545, "step": 1970 }, { "epoch": 1.9495548961424332, "grad_norm": 0.2874593210645949, "learning_rate": 1.9457676804690363e-05, "loss": 0.4127, "step": 1971 }, { "epoch": 1.950544015825915, "grad_norm": 0.31215326151701583, "learning_rate": 1.9439355075119092e-05, "loss": 0.3706, "step": 1972 }, { "epoch": 1.9515331355093966, "grad_norm": 0.2780337765875761, "learning_rate": 1.9421033345547822e-05, "loss": 0.3934, "step": 1973 }, { "epoch": 1.9525222551928785, "grad_norm": 0.3453992054984105, "learning_rate": 1.940271161597655e-05, "loss": 0.4671, "step": 1974 }, { "epoch": 1.95351137487636, "grad_norm": 0.3321376938331568, "learning_rate": 1.938438988640528e-05, "loss": 0.4228, "step": 1975 }, { "epoch": 1.9545004945598419, "grad_norm": 0.2928057230254324, "learning_rate": 1.9366068156834005e-05, "loss": 0.469, "step": 1976 }, { "epoch": 1.9554896142433233, "grad_norm": 0.30394150855107643, "learning_rate": 1.9347746427262735e-05, "loss": 0.4653, "step": 1977 }, { "epoch": 1.9564787339268053, "grad_norm": 0.2894909046020456, "learning_rate": 1.932942469769146e-05, "loss": 0.3932, "step": 1978 }, { "epoch": 1.9574678536102867, "grad_norm": 0.2711875527085813, "learning_rate": 1.9311102968120194e-05, "loss": 0.3798, "step": 1979 }, { "epoch": 1.9584569732937687, "grad_norm": 0.2725315165989554, "learning_rate": 1.929278123854892e-05, "loss": 0.3549, "step": 1980 }, { "epoch": 1.9594460929772501, "grad_norm": 0.26597443326290904, "learning_rate": 1.927445950897765e-05, "loss": 0.3765, "step": 1981 }, { "epoch": 1.960435212660732, "grad_norm": 0.27613823878296645, "learning_rate": 1.9256137779406377e-05, "loss": 0.3882, "step": 1982 }, { "epoch": 1.9614243323442135, "grad_norm": 0.2851650579144139, "learning_rate": 1.9237816049835107e-05, "loss": 0.4073, "step": 1983 }, { "epoch": 1.9624134520276955, "grad_norm": 0.2855817930760619, "learning_rate": 1.9219494320263833e-05, "loss": 0.4373, "step": 1984 }, { "epoch": 1.963402571711177, "grad_norm": 0.2727240403962694, "learning_rate": 1.9201172590692563e-05, "loss": 0.4231, "step": 1985 }, { "epoch": 1.9643916913946589, "grad_norm": 0.26875071309544885, "learning_rate": 1.918285086112129e-05, "loss": 0.4155, "step": 1986 }, { "epoch": 1.9653808110781403, "grad_norm": 0.24915569601701976, "learning_rate": 1.916452913155002e-05, "loss": 0.3926, "step": 1987 }, { "epoch": 1.9663699307616223, "grad_norm": 1.6073228027788682, "learning_rate": 1.9146207401978746e-05, "loss": 0.4279, "step": 1988 }, { "epoch": 1.9673590504451037, "grad_norm": 0.2792233401337717, "learning_rate": 1.9127885672407475e-05, "loss": 0.4021, "step": 1989 }, { "epoch": 1.9683481701285857, "grad_norm": 0.25494389574541926, "learning_rate": 1.9109563942836202e-05, "loss": 0.4267, "step": 1990 }, { "epoch": 1.9693372898120671, "grad_norm": 0.2898106866186205, "learning_rate": 1.9091242213264935e-05, "loss": 0.4323, "step": 1991 }, { "epoch": 1.970326409495549, "grad_norm": 0.2728511736986319, "learning_rate": 1.907292048369366e-05, "loss": 0.3848, "step": 1992 }, { "epoch": 1.9713155291790305, "grad_norm": 0.24292208041411942, "learning_rate": 1.905459875412239e-05, "loss": 0.4054, "step": 1993 }, { "epoch": 1.9723046488625124, "grad_norm": 0.25921473245660914, "learning_rate": 1.9036277024551118e-05, "loss": 0.393, "step": 1994 }, { "epoch": 1.973293768545994, "grad_norm": 0.27928092261140336, "learning_rate": 1.9017955294979847e-05, "loss": 0.4445, "step": 1995 }, { "epoch": 1.9742828882294758, "grad_norm": 0.27406269020220453, "learning_rate": 1.8999633565408577e-05, "loss": 0.454, "step": 1996 }, { "epoch": 1.9752720079129573, "grad_norm": 0.2614651016454685, "learning_rate": 1.8981311835837304e-05, "loss": 0.3824, "step": 1997 }, { "epoch": 1.9762611275964392, "grad_norm": 0.27737401784240373, "learning_rate": 1.8962990106266033e-05, "loss": 0.4434, "step": 1998 }, { "epoch": 1.9772502472799207, "grad_norm": 0.2604415643950939, "learning_rate": 1.894466837669476e-05, "loss": 0.4093, "step": 1999 }, { "epoch": 1.9782393669634026, "grad_norm": 0.24222620275410053, "learning_rate": 1.892634664712349e-05, "loss": 0.4325, "step": 2000 }, { "epoch": 1.979228486646884, "grad_norm": 0.23224541289636652, "learning_rate": 1.8908024917552216e-05, "loss": 0.3769, "step": 2001 }, { "epoch": 1.980217606330366, "grad_norm": 0.2723348778036034, "learning_rate": 1.8889703187980946e-05, "loss": 0.3764, "step": 2002 }, { "epoch": 1.9812067260138477, "grad_norm": 0.31673784091219626, "learning_rate": 1.8871381458409676e-05, "loss": 0.4378, "step": 2003 }, { "epoch": 1.9821958456973294, "grad_norm": 0.2928301587474028, "learning_rate": 1.8853059728838405e-05, "loss": 0.4119, "step": 2004 }, { "epoch": 1.9831849653808111, "grad_norm": 0.24968201113913796, "learning_rate": 1.8834737999267132e-05, "loss": 0.4273, "step": 2005 }, { "epoch": 1.9841740850642928, "grad_norm": 0.3322683975213439, "learning_rate": 1.8816416269695862e-05, "loss": 0.4857, "step": 2006 }, { "epoch": 1.9851632047477745, "grad_norm": 0.2951583445750594, "learning_rate": 1.8798094540124588e-05, "loss": 0.4307, "step": 2007 }, { "epoch": 1.9861523244312562, "grad_norm": 0.33937998935052427, "learning_rate": 1.8779772810553318e-05, "loss": 0.4664, "step": 2008 }, { "epoch": 1.987141444114738, "grad_norm": 0.2923057755122759, "learning_rate": 1.8761451080982044e-05, "loss": 0.5091, "step": 2009 }, { "epoch": 1.9881305637982196, "grad_norm": 0.2788462746112801, "learning_rate": 1.8743129351410774e-05, "loss": 0.386, "step": 2010 }, { "epoch": 1.9891196834817013, "grad_norm": 0.2516597878121542, "learning_rate": 1.87248076218395e-05, "loss": 0.3403, "step": 2011 }, { "epoch": 1.990108803165183, "grad_norm": 0.30376516700631484, "learning_rate": 1.870648589226823e-05, "loss": 0.4669, "step": 2012 }, { "epoch": 1.9910979228486647, "grad_norm": 0.28097309920042796, "learning_rate": 1.8688164162696957e-05, "loss": 0.4018, "step": 2013 }, { "epoch": 1.9920870425321464, "grad_norm": 0.27334460203651567, "learning_rate": 1.8669842433125687e-05, "loss": 0.4525, "step": 2014 }, { "epoch": 1.993076162215628, "grad_norm": 0.29672124438553915, "learning_rate": 1.8651520703554416e-05, "loss": 0.454, "step": 2015 }, { "epoch": 1.9940652818991098, "grad_norm": 0.2888799406442104, "learning_rate": 1.8633198973983146e-05, "loss": 0.4324, "step": 2016 }, { "epoch": 1.9950544015825915, "grad_norm": 0.25475149445910006, "learning_rate": 1.8614877244411873e-05, "loss": 0.4482, "step": 2017 }, { "epoch": 1.9960435212660732, "grad_norm": 0.29167348715567726, "learning_rate": 1.8596555514840602e-05, "loss": 0.399, "step": 2018 }, { "epoch": 1.997032640949555, "grad_norm": 0.2692693993748144, "learning_rate": 1.8578233785269332e-05, "loss": 0.4011, "step": 2019 }, { "epoch": 1.9980217606330366, "grad_norm": 0.2589196204328078, "learning_rate": 1.855991205569806e-05, "loss": 0.3859, "step": 2020 }, { "epoch": 1.9990108803165183, "grad_norm": 0.6296008364535154, "learning_rate": 1.854159032612679e-05, "loss": 0.4505, "step": 2021 }, { "epoch": 2.0, "grad_norm": 0.3007094377136532, "learning_rate": 1.8523268596555515e-05, "loss": 0.3938, "step": 2022 }, { "epoch": 2.000989119683482, "grad_norm": 0.33376807762869504, "learning_rate": 1.8504946866984245e-05, "loss": 0.362, "step": 2023 }, { "epoch": 2.0019782393669634, "grad_norm": 0.2601375793910521, "learning_rate": 1.848662513741297e-05, "loss": 0.3527, "step": 2024 }, { "epoch": 2.0029673590504453, "grad_norm": 0.2808677816345753, "learning_rate": 1.84683034078417e-05, "loss": 0.3309, "step": 2025 }, { "epoch": 2.003956478733927, "grad_norm": 0.3021748972184023, "learning_rate": 1.8449981678270427e-05, "loss": 0.3377, "step": 2026 }, { "epoch": 2.0049455984174087, "grad_norm": 0.25396738481241404, "learning_rate": 1.843165994869916e-05, "loss": 0.3214, "step": 2027 }, { "epoch": 2.00593471810089, "grad_norm": 0.2522846151099154, "learning_rate": 1.8413338219127887e-05, "loss": 0.33, "step": 2028 }, { "epoch": 2.006923837784372, "grad_norm": 0.2888004385011141, "learning_rate": 1.8395016489556617e-05, "loss": 0.3479, "step": 2029 }, { "epoch": 2.0079129574678536, "grad_norm": 0.2587138992354896, "learning_rate": 1.8376694759985343e-05, "loss": 0.2811, "step": 2030 }, { "epoch": 2.0089020771513355, "grad_norm": 0.3155016932703736, "learning_rate": 1.8358373030414073e-05, "loss": 0.3416, "step": 2031 }, { "epoch": 2.009891196834817, "grad_norm": 0.2857913643432993, "learning_rate": 1.83400513008428e-05, "loss": 0.3571, "step": 2032 }, { "epoch": 2.010880316518299, "grad_norm": 0.2678200050894992, "learning_rate": 1.832172957127153e-05, "loss": 0.3581, "step": 2033 }, { "epoch": 2.0118694362017804, "grad_norm": 0.25644421751373253, "learning_rate": 1.8303407841700256e-05, "loss": 0.313, "step": 2034 }, { "epoch": 2.0128585558852623, "grad_norm": 0.4465486558136206, "learning_rate": 1.8285086112128985e-05, "loss": 0.4098, "step": 2035 }, { "epoch": 2.013847675568744, "grad_norm": 0.2833668117731138, "learning_rate": 1.8266764382557712e-05, "loss": 0.3211, "step": 2036 }, { "epoch": 2.0148367952522257, "grad_norm": 0.23426073423687663, "learning_rate": 1.824844265298644e-05, "loss": 0.3165, "step": 2037 }, { "epoch": 2.015825914935707, "grad_norm": 0.32693965911004064, "learning_rate": 1.823012092341517e-05, "loss": 0.4052, "step": 2038 }, { "epoch": 2.016815034619189, "grad_norm": 0.3380208112798089, "learning_rate": 1.82117991938439e-05, "loss": 0.3321, "step": 2039 }, { "epoch": 2.0178041543026706, "grad_norm": 0.24941143626683626, "learning_rate": 1.8193477464272628e-05, "loss": 0.3336, "step": 2040 }, { "epoch": 2.0187932739861525, "grad_norm": 0.277234094183247, "learning_rate": 1.8175155734701357e-05, "loss": 0.353, "step": 2041 }, { "epoch": 2.019782393669634, "grad_norm": 0.2726781199448609, "learning_rate": 1.8156834005130087e-05, "loss": 0.331, "step": 2042 }, { "epoch": 2.020771513353116, "grad_norm": 0.2617540569925556, "learning_rate": 1.8138512275558814e-05, "loss": 0.3276, "step": 2043 }, { "epoch": 2.0217606330365974, "grad_norm": 0.23504401210710754, "learning_rate": 1.8120190545987544e-05, "loss": 0.3109, "step": 2044 }, { "epoch": 2.0227497527200793, "grad_norm": 0.2407046086285793, "learning_rate": 1.810186881641627e-05, "loss": 0.3374, "step": 2045 }, { "epoch": 2.0237388724035608, "grad_norm": 0.2641964761821478, "learning_rate": 1.8083547086845e-05, "loss": 0.376, "step": 2046 }, { "epoch": 2.0247279920870427, "grad_norm": 0.2882718123838396, "learning_rate": 1.8065225357273726e-05, "loss": 0.3328, "step": 2047 }, { "epoch": 2.025717111770524, "grad_norm": 0.2786807025896711, "learning_rate": 1.8046903627702456e-05, "loss": 0.3459, "step": 2048 }, { "epoch": 2.026706231454006, "grad_norm": 0.2539432049765403, "learning_rate": 1.8028581898131182e-05, "loss": 0.3647, "step": 2049 }, { "epoch": 2.0276953511374876, "grad_norm": 0.2556731150500688, "learning_rate": 1.8010260168559916e-05, "loss": 0.3777, "step": 2050 }, { "epoch": 2.0286844708209695, "grad_norm": 0.25486739832116906, "learning_rate": 1.7991938438988642e-05, "loss": 0.3138, "step": 2051 }, { "epoch": 2.029673590504451, "grad_norm": 0.2774415544605617, "learning_rate": 1.7973616709417372e-05, "loss": 0.3343, "step": 2052 }, { "epoch": 2.030662710187933, "grad_norm": 0.25646240486523647, "learning_rate": 1.7955294979846098e-05, "loss": 0.3676, "step": 2053 }, { "epoch": 2.0316518298714143, "grad_norm": 0.23627554159821249, "learning_rate": 1.7936973250274828e-05, "loss": 0.297, "step": 2054 }, { "epoch": 2.0326409495548963, "grad_norm": 0.9750055399356007, "learning_rate": 1.7918651520703554e-05, "loss": 0.3385, "step": 2055 }, { "epoch": 2.0336300692383777, "grad_norm": 0.3259195566861495, "learning_rate": 1.7900329791132284e-05, "loss": 0.3283, "step": 2056 }, { "epoch": 2.0346191889218597, "grad_norm": 0.34994469632467384, "learning_rate": 1.788200806156101e-05, "loss": 0.3335, "step": 2057 }, { "epoch": 2.035608308605341, "grad_norm": 0.3874809670228957, "learning_rate": 1.786368633198974e-05, "loss": 0.3423, "step": 2058 }, { "epoch": 2.036597428288823, "grad_norm": 0.4397999002338133, "learning_rate": 1.7845364602418467e-05, "loss": 0.3911, "step": 2059 }, { "epoch": 2.0375865479723045, "grad_norm": 0.2965068166961372, "learning_rate": 1.7827042872847197e-05, "loss": 0.368, "step": 2060 }, { "epoch": 2.0385756676557865, "grad_norm": 0.33913465022866834, "learning_rate": 1.7808721143275926e-05, "loss": 0.3485, "step": 2061 }, { "epoch": 2.039564787339268, "grad_norm": 0.41953455881087176, "learning_rate": 1.7790399413704656e-05, "loss": 0.3337, "step": 2062 }, { "epoch": 2.04055390702275, "grad_norm": 0.24963906693283444, "learning_rate": 1.7772077684133383e-05, "loss": 0.3388, "step": 2063 }, { "epoch": 2.0415430267062313, "grad_norm": 0.24427178402898375, "learning_rate": 1.7753755954562113e-05, "loss": 0.3758, "step": 2064 }, { "epoch": 2.0425321463897133, "grad_norm": 0.3391522202251561, "learning_rate": 1.773543422499084e-05, "loss": 0.3954, "step": 2065 }, { "epoch": 2.0435212660731947, "grad_norm": 0.305105117952119, "learning_rate": 1.771711249541957e-05, "loss": 0.3189, "step": 2066 }, { "epoch": 2.0445103857566767, "grad_norm": 0.2872800239846866, "learning_rate": 1.76987907658483e-05, "loss": 0.3776, "step": 2067 }, { "epoch": 2.045499505440158, "grad_norm": 0.2504019307251769, "learning_rate": 1.7680469036277025e-05, "loss": 0.356, "step": 2068 }, { "epoch": 2.04648862512364, "grad_norm": 0.25901454727456, "learning_rate": 1.7662147306705755e-05, "loss": 0.3776, "step": 2069 }, { "epoch": 2.0474777448071215, "grad_norm": 0.29939939557269285, "learning_rate": 1.764382557713448e-05, "loss": 0.3126, "step": 2070 }, { "epoch": 2.0484668644906034, "grad_norm": 0.3231004145272702, "learning_rate": 1.762550384756321e-05, "loss": 0.3393, "step": 2071 }, { "epoch": 2.049455984174085, "grad_norm": 0.2476325861585833, "learning_rate": 1.7607182117991937e-05, "loss": 0.3463, "step": 2072 }, { "epoch": 2.050445103857567, "grad_norm": 0.2979710367960843, "learning_rate": 1.7588860388420667e-05, "loss": 0.3726, "step": 2073 }, { "epoch": 2.0514342235410483, "grad_norm": 0.2762787234379862, "learning_rate": 1.7570538658849397e-05, "loss": 0.3372, "step": 2074 }, { "epoch": 2.0524233432245302, "grad_norm": 0.2525936147732162, "learning_rate": 1.7552216929278127e-05, "loss": 0.3224, "step": 2075 }, { "epoch": 2.0534124629080117, "grad_norm": 0.2584635923287862, "learning_rate": 1.7533895199706853e-05, "loss": 0.3592, "step": 2076 }, { "epoch": 2.0544015825914936, "grad_norm": 0.24758511782034934, "learning_rate": 1.7515573470135583e-05, "loss": 0.3228, "step": 2077 }, { "epoch": 2.055390702274975, "grad_norm": 0.2883921683823937, "learning_rate": 1.749725174056431e-05, "loss": 0.3278, "step": 2078 }, { "epoch": 2.056379821958457, "grad_norm": 0.3758824821850759, "learning_rate": 1.747893001099304e-05, "loss": 0.3069, "step": 2079 }, { "epoch": 2.0573689416419385, "grad_norm": 0.26898488904546836, "learning_rate": 1.7460608281421766e-05, "loss": 0.3374, "step": 2080 }, { "epoch": 2.0583580613254204, "grad_norm": 0.22348922505676247, "learning_rate": 1.7442286551850495e-05, "loss": 0.297, "step": 2081 }, { "epoch": 2.059347181008902, "grad_norm": 0.2431620389642761, "learning_rate": 1.7423964822279222e-05, "loss": 0.3452, "step": 2082 }, { "epoch": 2.060336300692384, "grad_norm": 0.2715649731140133, "learning_rate": 1.7405643092707952e-05, "loss": 0.3588, "step": 2083 }, { "epoch": 2.0613254203758653, "grad_norm": 0.24583847764861538, "learning_rate": 1.738732136313668e-05, "loss": 0.3515, "step": 2084 }, { "epoch": 2.0623145400593472, "grad_norm": 6.147132786755756, "learning_rate": 1.7368999633565408e-05, "loss": 0.6556, "step": 2085 }, { "epoch": 2.0633036597428287, "grad_norm": 0.2739117127003737, "learning_rate": 1.7350677903994138e-05, "loss": 0.4016, "step": 2086 }, { "epoch": 2.0642927794263106, "grad_norm": 0.2735119068979336, "learning_rate": 1.7332356174422868e-05, "loss": 0.3623, "step": 2087 }, { "epoch": 2.065281899109792, "grad_norm": 0.2384520866225091, "learning_rate": 1.7314034444851594e-05, "loss": 0.3101, "step": 2088 }, { "epoch": 2.066271018793274, "grad_norm": 0.2396852378862046, "learning_rate": 1.7295712715280324e-05, "loss": 0.3292, "step": 2089 }, { "epoch": 2.0672601384767555, "grad_norm": 0.9802498145057038, "learning_rate": 1.7277390985709054e-05, "loss": 0.4025, "step": 2090 }, { "epoch": 2.0682492581602374, "grad_norm": 0.26601631784836316, "learning_rate": 1.725906925613778e-05, "loss": 0.3529, "step": 2091 }, { "epoch": 2.069238377843719, "grad_norm": 0.2914785211803027, "learning_rate": 1.724074752656651e-05, "loss": 0.3318, "step": 2092 }, { "epoch": 2.070227497527201, "grad_norm": 0.27346245434038796, "learning_rate": 1.7222425796995236e-05, "loss": 0.3645, "step": 2093 }, { "epoch": 2.0712166172106823, "grad_norm": 0.24874170019953046, "learning_rate": 1.7204104067423966e-05, "loss": 0.3408, "step": 2094 }, { "epoch": 2.072205736894164, "grad_norm": 0.297906315031933, "learning_rate": 1.7185782337852692e-05, "loss": 0.3444, "step": 2095 }, { "epoch": 2.0731948565776457, "grad_norm": 0.2744309054019997, "learning_rate": 1.7167460608281422e-05, "loss": 0.3133, "step": 2096 }, { "epoch": 2.0741839762611276, "grad_norm": 0.24571474785996675, "learning_rate": 1.714913887871015e-05, "loss": 0.3276, "step": 2097 }, { "epoch": 2.075173095944609, "grad_norm": 0.26936390193781384, "learning_rate": 1.7130817149138882e-05, "loss": 0.3773, "step": 2098 }, { "epoch": 2.076162215628091, "grad_norm": 0.27132705449446876, "learning_rate": 1.7112495419567608e-05, "loss": 0.3354, "step": 2099 }, { "epoch": 2.077151335311573, "grad_norm": 0.26429280776805036, "learning_rate": 1.7094173689996338e-05, "loss": 0.3582, "step": 2100 }, { "epoch": 2.0781404549950544, "grad_norm": 0.2312406462125731, "learning_rate": 1.7075851960425065e-05, "loss": 0.3097, "step": 2101 }, { "epoch": 2.079129574678536, "grad_norm": 0.24469123822007616, "learning_rate": 1.7057530230853794e-05, "loss": 0.3485, "step": 2102 }, { "epoch": 2.080118694362018, "grad_norm": 0.2319289399183783, "learning_rate": 1.703920850128252e-05, "loss": 0.3115, "step": 2103 }, { "epoch": 2.0811078140454997, "grad_norm": 0.2528130883133997, "learning_rate": 1.702088677171125e-05, "loss": 0.3154, "step": 2104 }, { "epoch": 2.082096933728981, "grad_norm": 0.2530788330356479, "learning_rate": 1.7002565042139977e-05, "loss": 0.3571, "step": 2105 }, { "epoch": 2.083086053412463, "grad_norm": 0.25288858939964987, "learning_rate": 1.6984243312568707e-05, "loss": 0.3261, "step": 2106 }, { "epoch": 2.0840751730959446, "grad_norm": 0.2573111175053986, "learning_rate": 1.6965921582997437e-05, "loss": 0.3312, "step": 2107 }, { "epoch": 2.0850642927794265, "grad_norm": 0.24283051150823262, "learning_rate": 1.6947599853426163e-05, "loss": 0.331, "step": 2108 }, { "epoch": 2.086053412462908, "grad_norm": 0.7752505829926737, "learning_rate": 1.6929278123854893e-05, "loss": 0.343, "step": 2109 }, { "epoch": 2.08704253214639, "grad_norm": 0.29407470466934255, "learning_rate": 1.6910956394283623e-05, "loss": 0.3481, "step": 2110 }, { "epoch": 2.0880316518298714, "grad_norm": 0.24568189642774338, "learning_rate": 1.689263466471235e-05, "loss": 0.3132, "step": 2111 }, { "epoch": 2.0890207715133533, "grad_norm": 0.23470405276109677, "learning_rate": 1.687431293514108e-05, "loss": 0.305, "step": 2112 }, { "epoch": 2.090009891196835, "grad_norm": 0.26268923309496633, "learning_rate": 1.685599120556981e-05, "loss": 0.2962, "step": 2113 }, { "epoch": 2.0909990108803167, "grad_norm": 0.26191345194398796, "learning_rate": 1.6837669475998535e-05, "loss": 0.3463, "step": 2114 }, { "epoch": 2.091988130563798, "grad_norm": 1.0669555775783601, "learning_rate": 1.6819347746427265e-05, "loss": 0.3839, "step": 2115 }, { "epoch": 2.09297725024728, "grad_norm": 0.2443401607557005, "learning_rate": 1.680102601685599e-05, "loss": 0.3496, "step": 2116 }, { "epoch": 2.0939663699307616, "grad_norm": 0.2788783097347, "learning_rate": 1.678270428728472e-05, "loss": 0.3646, "step": 2117 }, { "epoch": 2.0949554896142435, "grad_norm": 0.2808482996134958, "learning_rate": 1.6764382557713447e-05, "loss": 0.3222, "step": 2118 }, { "epoch": 2.095944609297725, "grad_norm": 0.27568916240187386, "learning_rate": 1.6746060828142177e-05, "loss": 0.3844, "step": 2119 }, { "epoch": 2.096933728981207, "grad_norm": 0.2581105035106014, "learning_rate": 1.6727739098570904e-05, "loss": 0.3115, "step": 2120 }, { "epoch": 2.0979228486646884, "grad_norm": 0.2541721070612834, "learning_rate": 1.6709417368999634e-05, "loss": 0.3393, "step": 2121 }, { "epoch": 2.0989119683481703, "grad_norm": 0.23819664141370792, "learning_rate": 1.6691095639428363e-05, "loss": 0.3133, "step": 2122 }, { "epoch": 2.0999010880316518, "grad_norm": 0.2557320947503553, "learning_rate": 1.6672773909857093e-05, "loss": 0.3477, "step": 2123 }, { "epoch": 2.1008902077151337, "grad_norm": 0.2271503017475219, "learning_rate": 1.665445218028582e-05, "loss": 0.324, "step": 2124 }, { "epoch": 2.101879327398615, "grad_norm": 0.2863216334585617, "learning_rate": 1.663613045071455e-05, "loss": 0.3486, "step": 2125 }, { "epoch": 2.102868447082097, "grad_norm": 0.2855347267238924, "learning_rate": 1.6617808721143276e-05, "loss": 0.3575, "step": 2126 }, { "epoch": 2.1038575667655786, "grad_norm": 0.23052832309475305, "learning_rate": 1.6599486991572006e-05, "loss": 0.3211, "step": 2127 }, { "epoch": 2.1048466864490605, "grad_norm": 0.24044171148061522, "learning_rate": 1.6581165262000732e-05, "loss": 0.3517, "step": 2128 }, { "epoch": 2.105835806132542, "grad_norm": 0.26424268988397426, "learning_rate": 1.6562843532429462e-05, "loss": 0.3354, "step": 2129 }, { "epoch": 2.106824925816024, "grad_norm": 0.32985801683730653, "learning_rate": 1.654452180285819e-05, "loss": 0.3937, "step": 2130 }, { "epoch": 2.1078140454995054, "grad_norm": 0.2851148291510856, "learning_rate": 1.6526200073286918e-05, "loss": 0.369, "step": 2131 }, { "epoch": 2.1088031651829873, "grad_norm": 0.2631999833247007, "learning_rate": 1.6507878343715648e-05, "loss": 0.3141, "step": 2132 }, { "epoch": 2.1097922848664687, "grad_norm": 0.27216770136236385, "learning_rate": 1.6489556614144374e-05, "loss": 0.3707, "step": 2133 }, { "epoch": 2.1107814045499507, "grad_norm": 0.26372284319992984, "learning_rate": 1.6471234884573104e-05, "loss": 0.3432, "step": 2134 }, { "epoch": 2.111770524233432, "grad_norm": 2.7801144799732995, "learning_rate": 1.6452913155001834e-05, "loss": 0.4697, "step": 2135 }, { "epoch": 2.112759643916914, "grad_norm": 0.3911000168082124, "learning_rate": 1.6434591425430564e-05, "loss": 0.3456, "step": 2136 }, { "epoch": 2.1137487636003955, "grad_norm": 0.39596832117969594, "learning_rate": 1.641626969585929e-05, "loss": 0.2952, "step": 2137 }, { "epoch": 2.1147378832838775, "grad_norm": 0.29291151380810526, "learning_rate": 1.639794796628802e-05, "loss": 0.3477, "step": 2138 }, { "epoch": 2.115727002967359, "grad_norm": 0.2601124431784706, "learning_rate": 1.6379626236716746e-05, "loss": 0.4013, "step": 2139 }, { "epoch": 2.116716122650841, "grad_norm": 0.3076184020471208, "learning_rate": 1.6361304507145476e-05, "loss": 0.3481, "step": 2140 }, { "epoch": 2.1177052423343223, "grad_norm": 0.31115479146294667, "learning_rate": 1.6342982777574203e-05, "loss": 0.3502, "step": 2141 }, { "epoch": 2.1186943620178043, "grad_norm": 0.2729157234349751, "learning_rate": 1.6324661048002932e-05, "loss": 0.3569, "step": 2142 }, { "epoch": 2.1196834817012857, "grad_norm": 0.2522458263760746, "learning_rate": 1.630633931843166e-05, "loss": 0.3964, "step": 2143 }, { "epoch": 2.1206726013847677, "grad_norm": 0.26441364119229976, "learning_rate": 1.628801758886039e-05, "loss": 0.3243, "step": 2144 }, { "epoch": 2.121661721068249, "grad_norm": 0.26478323771121154, "learning_rate": 1.6269695859289115e-05, "loss": 0.3417, "step": 2145 }, { "epoch": 2.122650840751731, "grad_norm": 5.200418924621773, "learning_rate": 1.6251374129717848e-05, "loss": 0.9442, "step": 2146 }, { "epoch": 2.1236399604352125, "grad_norm": 0.26631129417116767, "learning_rate": 1.6233052400146575e-05, "loss": 0.3734, "step": 2147 }, { "epoch": 2.1246290801186944, "grad_norm": 0.2397373984722219, "learning_rate": 1.6214730670575304e-05, "loss": 0.3142, "step": 2148 }, { "epoch": 2.125618199802176, "grad_norm": 0.24875529064629065, "learning_rate": 1.619640894100403e-05, "loss": 0.3487, "step": 2149 }, { "epoch": 2.126607319485658, "grad_norm": 0.265519222625967, "learning_rate": 1.617808721143276e-05, "loss": 0.319, "step": 2150 }, { "epoch": 2.1275964391691393, "grad_norm": 0.2842533650962704, "learning_rate": 1.6159765481861487e-05, "loss": 0.3824, "step": 2151 }, { "epoch": 2.1285855588526212, "grad_norm": 0.22298739828635666, "learning_rate": 1.6141443752290217e-05, "loss": 0.2756, "step": 2152 }, { "epoch": 2.1295746785361027, "grad_norm": 0.25735506163209704, "learning_rate": 1.6123122022718947e-05, "loss": 0.3611, "step": 2153 }, { "epoch": 2.1305637982195846, "grad_norm": 0.2818421075725635, "learning_rate": 1.6104800293147673e-05, "loss": 0.3923, "step": 2154 }, { "epoch": 2.131552917903066, "grad_norm": 0.2754159452458974, "learning_rate": 1.6086478563576403e-05, "loss": 0.3704, "step": 2155 }, { "epoch": 2.132542037586548, "grad_norm": 0.23952536288599155, "learning_rate": 1.606815683400513e-05, "loss": 0.318, "step": 2156 }, { "epoch": 2.1335311572700295, "grad_norm": 0.25008759238683226, "learning_rate": 1.604983510443386e-05, "loss": 0.3357, "step": 2157 }, { "epoch": 2.1345202769535114, "grad_norm": 0.27461726260777686, "learning_rate": 1.603151337486259e-05, "loss": 0.3535, "step": 2158 }, { "epoch": 2.135509396636993, "grad_norm": 0.27788201287671715, "learning_rate": 1.601319164529132e-05, "loss": 0.3793, "step": 2159 }, { "epoch": 2.136498516320475, "grad_norm": 0.26150100655181496, "learning_rate": 1.5994869915720045e-05, "loss": 0.3318, "step": 2160 }, { "epoch": 2.1374876360039563, "grad_norm": 0.277747766506713, "learning_rate": 1.5976548186148775e-05, "loss": 0.356, "step": 2161 }, { "epoch": 2.1384767556874382, "grad_norm": 0.25722497139475575, "learning_rate": 1.59582264565775e-05, "loss": 0.3627, "step": 2162 }, { "epoch": 2.1394658753709197, "grad_norm": 0.23571960846153608, "learning_rate": 1.593990472700623e-05, "loss": 0.3348, "step": 2163 }, { "epoch": 2.1404549950544016, "grad_norm": 0.2968932627364013, "learning_rate": 1.5921582997434958e-05, "loss": 0.3604, "step": 2164 }, { "epoch": 2.141444114737883, "grad_norm": 0.2755392915657586, "learning_rate": 1.5903261267863687e-05, "loss": 0.3541, "step": 2165 }, { "epoch": 2.142433234421365, "grad_norm": 0.23899316297498144, "learning_rate": 1.5884939538292414e-05, "loss": 0.3257, "step": 2166 }, { "epoch": 2.1434223541048465, "grad_norm": 0.25067453063367884, "learning_rate": 1.5866617808721144e-05, "loss": 0.3296, "step": 2167 }, { "epoch": 2.1444114737883284, "grad_norm": 0.24598510268818005, "learning_rate": 1.584829607914987e-05, "loss": 0.3175, "step": 2168 }, { "epoch": 2.14540059347181, "grad_norm": 0.24842567923116413, "learning_rate": 1.58299743495786e-05, "loss": 0.3386, "step": 2169 }, { "epoch": 2.146389713155292, "grad_norm": 0.279977258093171, "learning_rate": 1.581165262000733e-05, "loss": 0.3838, "step": 2170 }, { "epoch": 2.1473788328387733, "grad_norm": 0.30234421674111595, "learning_rate": 1.579333089043606e-05, "loss": 0.3861, "step": 2171 }, { "epoch": 2.148367952522255, "grad_norm": 0.23537629559910506, "learning_rate": 1.5775009160864786e-05, "loss": 0.329, "step": 2172 }, { "epoch": 2.1493570722057367, "grad_norm": 0.27012138733219354, "learning_rate": 1.5756687431293516e-05, "loss": 0.3325, "step": 2173 }, { "epoch": 2.1503461918892186, "grad_norm": 0.24908855134556376, "learning_rate": 1.5738365701722242e-05, "loss": 0.3055, "step": 2174 }, { "epoch": 2.1513353115727005, "grad_norm": 0.22465760736037585, "learning_rate": 1.5720043972150972e-05, "loss": 0.3314, "step": 2175 }, { "epoch": 2.152324431256182, "grad_norm": 0.24248671493308285, "learning_rate": 1.57017222425797e-05, "loss": 0.3247, "step": 2176 }, { "epoch": 2.1533135509396635, "grad_norm": 0.22953998434558445, "learning_rate": 1.5683400513008428e-05, "loss": 0.2973, "step": 2177 }, { "epoch": 2.1543026706231454, "grad_norm": 0.22595938467456078, "learning_rate": 1.5665078783437158e-05, "loss": 0.3005, "step": 2178 }, { "epoch": 2.1552917903066273, "grad_norm": 0.2506828155398757, "learning_rate": 1.5646757053865884e-05, "loss": 0.3772, "step": 2179 }, { "epoch": 2.156280909990109, "grad_norm": 0.25791925829552964, "learning_rate": 1.5628435324294614e-05, "loss": 0.3562, "step": 2180 }, { "epoch": 2.1572700296735903, "grad_norm": 0.24209818620770746, "learning_rate": 1.5610113594723344e-05, "loss": 0.2983, "step": 2181 }, { "epoch": 2.158259149357072, "grad_norm": 0.23728500789942555, "learning_rate": 1.5591791865152074e-05, "loss": 0.3153, "step": 2182 }, { "epoch": 2.159248269040554, "grad_norm": 0.26976125221293046, "learning_rate": 1.55734701355808e-05, "loss": 0.3283, "step": 2183 }, { "epoch": 2.1602373887240356, "grad_norm": 0.2401119717618975, "learning_rate": 1.555514840600953e-05, "loss": 0.3377, "step": 2184 }, { "epoch": 2.1612265084075175, "grad_norm": 0.2321954749389243, "learning_rate": 1.5536826676438256e-05, "loss": 0.3271, "step": 2185 }, { "epoch": 2.162215628090999, "grad_norm": 0.23834291539778332, "learning_rate": 1.5518504946866986e-05, "loss": 0.3227, "step": 2186 }, { "epoch": 2.163204747774481, "grad_norm": 0.3017365420246128, "learning_rate": 1.5500183217295713e-05, "loss": 0.3912, "step": 2187 }, { "epoch": 2.1641938674579624, "grad_norm": 0.25939756442962536, "learning_rate": 1.5481861487724442e-05, "loss": 0.3645, "step": 2188 }, { "epoch": 2.1651829871414443, "grad_norm": 0.2671540694432964, "learning_rate": 1.546353975815317e-05, "loss": 0.3464, "step": 2189 }, { "epoch": 2.166172106824926, "grad_norm": 0.24114920876599572, "learning_rate": 1.54452180285819e-05, "loss": 0.3318, "step": 2190 }, { "epoch": 2.1671612265084077, "grad_norm": 0.23250896713005623, "learning_rate": 1.5426896299010625e-05, "loss": 0.3018, "step": 2191 }, { "epoch": 2.168150346191889, "grad_norm": 0.25900429796077135, "learning_rate": 1.5408574569439355e-05, "loss": 0.3794, "step": 2192 }, { "epoch": 2.169139465875371, "grad_norm": 0.2534934876573256, "learning_rate": 1.5390252839868085e-05, "loss": 0.3412, "step": 2193 }, { "epoch": 2.1701285855588526, "grad_norm": 0.3237099924662489, "learning_rate": 1.5371931110296814e-05, "loss": 0.3876, "step": 2194 }, { "epoch": 2.1711177052423345, "grad_norm": 0.2263325734121825, "learning_rate": 1.535360938072554e-05, "loss": 0.3421, "step": 2195 }, { "epoch": 2.172106824925816, "grad_norm": 0.27015210617952656, "learning_rate": 1.533528765115427e-05, "loss": 0.325, "step": 2196 }, { "epoch": 2.173095944609298, "grad_norm": 0.24041857827848195, "learning_rate": 1.5316965921582997e-05, "loss": 0.3352, "step": 2197 }, { "epoch": 2.1740850642927794, "grad_norm": 0.2303276894642009, "learning_rate": 1.5298644192011727e-05, "loss": 0.3082, "step": 2198 }, { "epoch": 2.1750741839762613, "grad_norm": 0.22582439744525307, "learning_rate": 1.5280322462440457e-05, "loss": 0.3294, "step": 2199 }, { "epoch": 2.1760633036597428, "grad_norm": 0.24683399509790616, "learning_rate": 1.5262000732869183e-05, "loss": 0.3577, "step": 2200 }, { "epoch": 2.1770524233432247, "grad_norm": 0.2370261131813352, "learning_rate": 1.5243679003297911e-05, "loss": 0.3411, "step": 2201 }, { "epoch": 2.178041543026706, "grad_norm": 0.24781478654496875, "learning_rate": 1.522535727372664e-05, "loss": 0.3316, "step": 2202 }, { "epoch": 2.179030662710188, "grad_norm": 0.23239061025797889, "learning_rate": 1.5207035544155367e-05, "loss": 0.3263, "step": 2203 }, { "epoch": 2.1800197823936696, "grad_norm": 0.2232722530248141, "learning_rate": 1.5188713814584096e-05, "loss": 0.3081, "step": 2204 }, { "epoch": 2.1810089020771515, "grad_norm": 0.2594820299682084, "learning_rate": 1.5170392085012827e-05, "loss": 0.3653, "step": 2205 }, { "epoch": 2.181998021760633, "grad_norm": 0.24820776329147887, "learning_rate": 1.5152070355441555e-05, "loss": 0.3115, "step": 2206 }, { "epoch": 2.182987141444115, "grad_norm": 0.4308536837119438, "learning_rate": 1.5133748625870283e-05, "loss": 0.3583, "step": 2207 }, { "epoch": 2.1839762611275964, "grad_norm": 0.2447733062324265, "learning_rate": 1.5115426896299011e-05, "loss": 0.3242, "step": 2208 }, { "epoch": 2.1849653808110783, "grad_norm": 0.255212859520126, "learning_rate": 1.509710516672774e-05, "loss": 0.3407, "step": 2209 }, { "epoch": 2.1859545004945597, "grad_norm": 0.24780445254507258, "learning_rate": 1.5078783437156468e-05, "loss": 0.3777, "step": 2210 }, { "epoch": 2.1869436201780417, "grad_norm": 0.22667950198277595, "learning_rate": 1.5060461707585197e-05, "loss": 0.322, "step": 2211 }, { "epoch": 2.187932739861523, "grad_norm": 0.2736208075026148, "learning_rate": 1.5042139978013926e-05, "loss": 0.3435, "step": 2212 }, { "epoch": 2.188921859545005, "grad_norm": 0.25076062434160956, "learning_rate": 1.5023818248442654e-05, "loss": 0.3341, "step": 2213 }, { "epoch": 2.1899109792284865, "grad_norm": 0.2516337328032655, "learning_rate": 1.5005496518871382e-05, "loss": 0.3767, "step": 2214 }, { "epoch": 2.1909000989119685, "grad_norm": 0.24007131228024153, "learning_rate": 1.498717478930011e-05, "loss": 0.3304, "step": 2215 }, { "epoch": 2.19188921859545, "grad_norm": 0.2653627206351916, "learning_rate": 1.4968853059728838e-05, "loss": 0.3581, "step": 2216 }, { "epoch": 2.192878338278932, "grad_norm": 0.23740704954394445, "learning_rate": 1.495053133015757e-05, "loss": 0.3322, "step": 2217 }, { "epoch": 2.1938674579624133, "grad_norm": 0.26183283644070443, "learning_rate": 1.4932209600586298e-05, "loss": 0.3058, "step": 2218 }, { "epoch": 2.1948565776458953, "grad_norm": 0.25535363753862483, "learning_rate": 1.4913887871015026e-05, "loss": 0.3867, "step": 2219 }, { "epoch": 2.1958456973293767, "grad_norm": 0.2410531896962311, "learning_rate": 1.4895566141443754e-05, "loss": 0.3287, "step": 2220 }, { "epoch": 2.1968348170128587, "grad_norm": 0.23864241175081596, "learning_rate": 1.4877244411872482e-05, "loss": 0.2957, "step": 2221 }, { "epoch": 2.19782393669634, "grad_norm": 0.23152858294826056, "learning_rate": 1.485892268230121e-05, "loss": 0.3267, "step": 2222 }, { "epoch": 2.198813056379822, "grad_norm": 0.25038544912526106, "learning_rate": 1.4840600952729938e-05, "loss": 0.3727, "step": 2223 }, { "epoch": 2.1998021760633035, "grad_norm": 0.24575360880677702, "learning_rate": 1.4822279223158666e-05, "loss": 0.3204, "step": 2224 }, { "epoch": 2.2007912957467854, "grad_norm": 0.2617185419544845, "learning_rate": 1.4803957493587394e-05, "loss": 0.3344, "step": 2225 }, { "epoch": 2.201780415430267, "grad_norm": 0.23621554765095076, "learning_rate": 1.4785635764016123e-05, "loss": 0.3194, "step": 2226 }, { "epoch": 2.202769535113749, "grad_norm": 0.2656714021948276, "learning_rate": 1.476731403444485e-05, "loss": 0.3251, "step": 2227 }, { "epoch": 2.2037586547972303, "grad_norm": 0.2352678555408159, "learning_rate": 1.4748992304873579e-05, "loss": 0.3474, "step": 2228 }, { "epoch": 2.2047477744807122, "grad_norm": 0.23365121088195945, "learning_rate": 1.473067057530231e-05, "loss": 0.3214, "step": 2229 }, { "epoch": 2.2057368941641937, "grad_norm": 0.2455996352228685, "learning_rate": 1.4712348845731038e-05, "loss": 0.4093, "step": 2230 }, { "epoch": 2.2067260138476756, "grad_norm": 0.2740975848282929, "learning_rate": 1.4694027116159766e-05, "loss": 0.3399, "step": 2231 }, { "epoch": 2.207715133531157, "grad_norm": 0.2598627296070607, "learning_rate": 1.4675705386588495e-05, "loss": 0.3083, "step": 2232 }, { "epoch": 2.208704253214639, "grad_norm": 0.23897597759120234, "learning_rate": 1.4657383657017223e-05, "loss": 0.3551, "step": 2233 }, { "epoch": 2.2096933728981205, "grad_norm": 0.28717030628471557, "learning_rate": 1.4639061927445952e-05, "loss": 0.36, "step": 2234 }, { "epoch": 2.2106824925816024, "grad_norm": 0.2551734546245449, "learning_rate": 1.462074019787468e-05, "loss": 0.2981, "step": 2235 }, { "epoch": 2.211671612265084, "grad_norm": 0.22660711986812593, "learning_rate": 1.4602418468303409e-05, "loss": 0.3241, "step": 2236 }, { "epoch": 2.212660731948566, "grad_norm": 0.22072364142005796, "learning_rate": 1.4584096738732137e-05, "loss": 0.3397, "step": 2237 }, { "epoch": 2.2136498516320473, "grad_norm": 0.25077102419811953, "learning_rate": 1.4565775009160865e-05, "loss": 0.3007, "step": 2238 }, { "epoch": 2.2146389713155292, "grad_norm": 0.23275953912915226, "learning_rate": 1.4547453279589593e-05, "loss": 0.3301, "step": 2239 }, { "epoch": 2.2156280909990107, "grad_norm": 0.2552844764105821, "learning_rate": 1.4529131550018321e-05, "loss": 0.3359, "step": 2240 }, { "epoch": 2.2166172106824926, "grad_norm": 0.2209011298450527, "learning_rate": 1.4510809820447053e-05, "loss": 0.3598, "step": 2241 }, { "epoch": 2.217606330365974, "grad_norm": 1.480125764071416, "learning_rate": 1.449248809087578e-05, "loss": 0.3688, "step": 2242 }, { "epoch": 2.218595450049456, "grad_norm": 0.30985712503457974, "learning_rate": 1.4474166361304509e-05, "loss": 0.3408, "step": 2243 }, { "epoch": 2.2195845697329375, "grad_norm": 0.24892275955536927, "learning_rate": 1.4455844631733237e-05, "loss": 0.3085, "step": 2244 }, { "epoch": 2.2205736894164194, "grad_norm": 0.24101322361416325, "learning_rate": 1.4437522902161965e-05, "loss": 0.3437, "step": 2245 }, { "epoch": 2.221562809099901, "grad_norm": 0.2910916695987308, "learning_rate": 1.4419201172590693e-05, "loss": 0.342, "step": 2246 }, { "epoch": 2.222551928783383, "grad_norm": 0.2761723063225733, "learning_rate": 1.4400879443019421e-05, "loss": 0.3252, "step": 2247 }, { "epoch": 2.2235410484668643, "grad_norm": 0.23723078949172213, "learning_rate": 1.438255771344815e-05, "loss": 0.3535, "step": 2248 }, { "epoch": 2.224530168150346, "grad_norm": 0.23083729839007824, "learning_rate": 1.4364235983876878e-05, "loss": 0.3462, "step": 2249 }, { "epoch": 2.2255192878338277, "grad_norm": 0.24923526336688007, "learning_rate": 1.4345914254305606e-05, "loss": 0.3479, "step": 2250 }, { "epoch": 2.2265084075173096, "grad_norm": 0.2365620936428639, "learning_rate": 1.4327592524734334e-05, "loss": 0.3341, "step": 2251 }, { "epoch": 2.227497527200791, "grad_norm": 0.26201081319476877, "learning_rate": 1.4309270795163064e-05, "loss": 0.3605, "step": 2252 }, { "epoch": 2.228486646884273, "grad_norm": 0.23329839147411013, "learning_rate": 1.4290949065591793e-05, "loss": 0.3275, "step": 2253 }, { "epoch": 2.229475766567755, "grad_norm": 0.23449570191396432, "learning_rate": 1.4272627336020521e-05, "loss": 0.3344, "step": 2254 }, { "epoch": 2.2304648862512364, "grad_norm": 0.25077592463879617, "learning_rate": 1.425430560644925e-05, "loss": 0.3158, "step": 2255 }, { "epoch": 2.231454005934718, "grad_norm": 0.2638212095862189, "learning_rate": 1.4235983876877978e-05, "loss": 0.3905, "step": 2256 }, { "epoch": 2.2324431256182, "grad_norm": 0.2281528691605368, "learning_rate": 1.4217662147306708e-05, "loss": 0.3068, "step": 2257 }, { "epoch": 2.2334322453016817, "grad_norm": 0.24868299339572644, "learning_rate": 1.4199340417735436e-05, "loss": 0.3352, "step": 2258 }, { "epoch": 2.234421364985163, "grad_norm": 0.25226182191275837, "learning_rate": 1.4181018688164164e-05, "loss": 0.3799, "step": 2259 }, { "epoch": 2.2354104846686447, "grad_norm": 0.23719055969275504, "learning_rate": 1.4162696958592892e-05, "loss": 0.285, "step": 2260 }, { "epoch": 2.2363996043521266, "grad_norm": 0.22645086719819674, "learning_rate": 1.414437522902162e-05, "loss": 0.3535, "step": 2261 }, { "epoch": 2.2373887240356085, "grad_norm": 0.2378652475705354, "learning_rate": 1.4126053499450348e-05, "loss": 0.322, "step": 2262 }, { "epoch": 2.23837784371909, "grad_norm": 0.2446078244591778, "learning_rate": 1.4107731769879076e-05, "loss": 0.3008, "step": 2263 }, { "epoch": 2.239366963402572, "grad_norm": 0.23473892675122612, "learning_rate": 1.4089410040307804e-05, "loss": 0.3174, "step": 2264 }, { "epoch": 2.2403560830860534, "grad_norm": 0.2740082445935662, "learning_rate": 1.4071088310736536e-05, "loss": 0.3673, "step": 2265 }, { "epoch": 2.2413452027695353, "grad_norm": 0.3047335384705569, "learning_rate": 1.4052766581165264e-05, "loss": 0.3862, "step": 2266 }, { "epoch": 2.242334322453017, "grad_norm": 0.22953115502891203, "learning_rate": 1.4034444851593992e-05, "loss": 0.3544, "step": 2267 }, { "epoch": 2.2433234421364987, "grad_norm": 0.2381048121745753, "learning_rate": 1.401612312202272e-05, "loss": 0.385, "step": 2268 }, { "epoch": 2.24431256181998, "grad_norm": 0.22695002295754346, "learning_rate": 1.3997801392451448e-05, "loss": 0.3113, "step": 2269 }, { "epoch": 2.245301681503462, "grad_norm": 0.23494139839113953, "learning_rate": 1.3979479662880176e-05, "loss": 0.3006, "step": 2270 }, { "epoch": 2.2462908011869436, "grad_norm": 0.2508107946864829, "learning_rate": 1.3961157933308904e-05, "loss": 0.3602, "step": 2271 }, { "epoch": 2.2472799208704255, "grad_norm": 0.22515857517930243, "learning_rate": 1.3942836203737633e-05, "loss": 0.3296, "step": 2272 }, { "epoch": 2.248269040553907, "grad_norm": 0.25467938360409764, "learning_rate": 1.392451447416636e-05, "loss": 0.3448, "step": 2273 }, { "epoch": 2.249258160237389, "grad_norm": 0.30210334820145374, "learning_rate": 1.3906192744595089e-05, "loss": 0.3662, "step": 2274 }, { "epoch": 2.2502472799208704, "grad_norm": 0.22040906536404206, "learning_rate": 1.3887871015023819e-05, "loss": 0.3222, "step": 2275 }, { "epoch": 2.2512363996043523, "grad_norm": 0.24078053098136895, "learning_rate": 1.3869549285452547e-05, "loss": 0.3578, "step": 2276 }, { "epoch": 2.2522255192878338, "grad_norm": 0.23682978022586784, "learning_rate": 1.3851227555881277e-05, "loss": 0.3418, "step": 2277 }, { "epoch": 2.2532146389713157, "grad_norm": 0.25235210494112936, "learning_rate": 1.3832905826310005e-05, "loss": 0.342, "step": 2278 }, { "epoch": 2.254203758654797, "grad_norm": 0.21464531728228006, "learning_rate": 1.3814584096738733e-05, "loss": 0.2858, "step": 2279 }, { "epoch": 2.255192878338279, "grad_norm": 0.24947023802823698, "learning_rate": 1.3796262367167463e-05, "loss": 0.3581, "step": 2280 }, { "epoch": 2.2561819980217606, "grad_norm": 0.24126311074885234, "learning_rate": 1.377794063759619e-05, "loss": 0.3372, "step": 2281 }, { "epoch": 2.2571711177052425, "grad_norm": 0.24381172903295012, "learning_rate": 1.3759618908024919e-05, "loss": 0.3309, "step": 2282 }, { "epoch": 2.258160237388724, "grad_norm": 0.2216901316551094, "learning_rate": 1.3741297178453647e-05, "loss": 0.2906, "step": 2283 }, { "epoch": 2.259149357072206, "grad_norm": 0.22751523042939115, "learning_rate": 1.3722975448882375e-05, "loss": 0.327, "step": 2284 }, { "epoch": 2.2601384767556874, "grad_norm": 0.2253518058336741, "learning_rate": 1.3704653719311103e-05, "loss": 0.3119, "step": 2285 }, { "epoch": 2.2611275964391693, "grad_norm": 0.2479817150971885, "learning_rate": 1.3686331989739831e-05, "loss": 0.3448, "step": 2286 }, { "epoch": 2.2621167161226508, "grad_norm": 0.2201794942968985, "learning_rate": 1.366801026016856e-05, "loss": 0.3606, "step": 2287 }, { "epoch": 2.2631058358061327, "grad_norm": 0.21364590272690012, "learning_rate": 1.3649688530597287e-05, "loss": 0.3305, "step": 2288 }, { "epoch": 2.264094955489614, "grad_norm": 0.24392483676889015, "learning_rate": 1.3631366801026019e-05, "loss": 0.3393, "step": 2289 }, { "epoch": 2.265084075173096, "grad_norm": 0.24221769435099127, "learning_rate": 1.3613045071454747e-05, "loss": 0.3369, "step": 2290 }, { "epoch": 2.2660731948565775, "grad_norm": 0.24861588082223832, "learning_rate": 1.3594723341883475e-05, "loss": 0.3626, "step": 2291 }, { "epoch": 2.2670623145400595, "grad_norm": 0.23052081299132418, "learning_rate": 1.3576401612312203e-05, "loss": 0.3063, "step": 2292 }, { "epoch": 2.268051434223541, "grad_norm": 0.24275686147196318, "learning_rate": 1.3558079882740931e-05, "loss": 0.3038, "step": 2293 }, { "epoch": 2.269040553907023, "grad_norm": 0.22272342553004223, "learning_rate": 1.353975815316966e-05, "loss": 0.3094, "step": 2294 }, { "epoch": 2.2700296735905043, "grad_norm": 0.23299197318354145, "learning_rate": 1.3521436423598388e-05, "loss": 0.3396, "step": 2295 }, { "epoch": 2.2710187932739863, "grad_norm": 0.22615622555168388, "learning_rate": 1.3503114694027116e-05, "loss": 0.3336, "step": 2296 }, { "epoch": 2.2720079129574677, "grad_norm": 0.2252293872395291, "learning_rate": 1.3484792964455844e-05, "loss": 0.2977, "step": 2297 }, { "epoch": 2.2729970326409497, "grad_norm": 0.23457296789984822, "learning_rate": 1.3466471234884574e-05, "loss": 0.3399, "step": 2298 }, { "epoch": 2.273986152324431, "grad_norm": 0.24129016983914284, "learning_rate": 1.3448149505313302e-05, "loss": 0.3219, "step": 2299 }, { "epoch": 2.274975272007913, "grad_norm": 0.22141667622956007, "learning_rate": 1.342982777574203e-05, "loss": 0.3082, "step": 2300 }, { "epoch": 2.2759643916913945, "grad_norm": 0.2413245232232439, "learning_rate": 1.341150604617076e-05, "loss": 0.3426, "step": 2301 }, { "epoch": 2.2769535113748764, "grad_norm": 0.23696458764674602, "learning_rate": 1.3393184316599488e-05, "loss": 0.3759, "step": 2302 }, { "epoch": 2.277942631058358, "grad_norm": 0.2518570901503138, "learning_rate": 1.3374862587028218e-05, "loss": 0.3695, "step": 2303 }, { "epoch": 2.27893175074184, "grad_norm": 0.23086745437243067, "learning_rate": 1.3356540857456946e-05, "loss": 0.3369, "step": 2304 }, { "epoch": 2.2799208704253213, "grad_norm": 0.26169884977451946, "learning_rate": 1.3338219127885674e-05, "loss": 0.3373, "step": 2305 }, { "epoch": 2.2809099901088032, "grad_norm": 0.2587088888423314, "learning_rate": 1.3319897398314402e-05, "loss": 0.3574, "step": 2306 }, { "epoch": 2.2818991097922847, "grad_norm": 0.22977702824971338, "learning_rate": 1.330157566874313e-05, "loss": 0.3457, "step": 2307 }, { "epoch": 2.2828882294757666, "grad_norm": 0.22721408632878823, "learning_rate": 1.3283253939171858e-05, "loss": 0.308, "step": 2308 }, { "epoch": 2.283877349159248, "grad_norm": 0.2519093156052609, "learning_rate": 1.3264932209600586e-05, "loss": 0.3589, "step": 2309 }, { "epoch": 2.28486646884273, "grad_norm": 0.26712020676598086, "learning_rate": 1.3246610480029314e-05, "loss": 0.3624, "step": 2310 }, { "epoch": 2.2858555885262115, "grad_norm": 0.2326596051044427, "learning_rate": 1.3228288750458042e-05, "loss": 0.3335, "step": 2311 }, { "epoch": 2.2868447082096934, "grad_norm": 0.2765369772968104, "learning_rate": 1.320996702088677e-05, "loss": 0.3757, "step": 2312 }, { "epoch": 2.287833827893175, "grad_norm": 0.25058824305887195, "learning_rate": 1.3191645291315502e-05, "loss": 0.3808, "step": 2313 }, { "epoch": 2.288822947576657, "grad_norm": 0.2248644826213857, "learning_rate": 1.317332356174423e-05, "loss": 0.3213, "step": 2314 }, { "epoch": 2.2898120672601383, "grad_norm": 0.23779476179693096, "learning_rate": 1.3155001832172958e-05, "loss": 0.3293, "step": 2315 }, { "epoch": 2.2908011869436202, "grad_norm": 0.2723138257723175, "learning_rate": 1.3136680102601686e-05, "loss": 0.3428, "step": 2316 }, { "epoch": 2.2917903066271017, "grad_norm": 0.2616699975935992, "learning_rate": 1.3118358373030415e-05, "loss": 0.3454, "step": 2317 }, { "epoch": 2.2927794263105836, "grad_norm": 0.24102049258430722, "learning_rate": 1.3100036643459143e-05, "loss": 0.3333, "step": 2318 }, { "epoch": 2.293768545994065, "grad_norm": 0.23650754741019725, "learning_rate": 1.308171491388787e-05, "loss": 0.3123, "step": 2319 }, { "epoch": 2.294757665677547, "grad_norm": 0.37538745041783184, "learning_rate": 1.3063393184316599e-05, "loss": 0.369, "step": 2320 }, { "epoch": 2.2957467853610285, "grad_norm": 0.23666732747554203, "learning_rate": 1.3045071454745329e-05, "loss": 0.316, "step": 2321 }, { "epoch": 2.2967359050445104, "grad_norm": 0.22954722561453486, "learning_rate": 1.3026749725174057e-05, "loss": 0.3216, "step": 2322 }, { "epoch": 2.297725024727992, "grad_norm": 0.2490149729269961, "learning_rate": 1.3008427995602785e-05, "loss": 0.3144, "step": 2323 }, { "epoch": 2.298714144411474, "grad_norm": 0.25850900249259257, "learning_rate": 1.2990106266031515e-05, "loss": 0.3166, "step": 2324 }, { "epoch": 2.2997032640949557, "grad_norm": 0.26057122713059194, "learning_rate": 1.2971784536460243e-05, "loss": 0.3526, "step": 2325 }, { "epoch": 2.300692383778437, "grad_norm": 0.24491367081125778, "learning_rate": 1.2953462806888973e-05, "loss": 0.3525, "step": 2326 }, { "epoch": 2.3016815034619187, "grad_norm": 0.2760339712557652, "learning_rate": 1.29351410773177e-05, "loss": 0.3596, "step": 2327 }, { "epoch": 2.3026706231454006, "grad_norm": 0.24073219741729474, "learning_rate": 1.2916819347746429e-05, "loss": 0.3269, "step": 2328 }, { "epoch": 2.3036597428288825, "grad_norm": 0.2703548932443427, "learning_rate": 1.2898497618175157e-05, "loss": 0.3219, "step": 2329 }, { "epoch": 2.304648862512364, "grad_norm": 0.2687127704366253, "learning_rate": 1.2880175888603885e-05, "loss": 0.3417, "step": 2330 }, { "epoch": 2.3056379821958455, "grad_norm": 0.25270161397798035, "learning_rate": 1.2861854159032613e-05, "loss": 0.3714, "step": 2331 }, { "epoch": 2.3066271018793274, "grad_norm": 0.22146824391487552, "learning_rate": 1.2843532429461341e-05, "loss": 0.3217, "step": 2332 }, { "epoch": 2.3076162215628093, "grad_norm": 0.2558572987685534, "learning_rate": 1.282521069989007e-05, "loss": 0.3753, "step": 2333 }, { "epoch": 2.308605341246291, "grad_norm": 0.25523094828283194, "learning_rate": 1.2806888970318798e-05, "loss": 0.3419, "step": 2334 }, { "epoch": 2.3095944609297723, "grad_norm": 0.24281022901050803, "learning_rate": 1.2788567240747526e-05, "loss": 0.3487, "step": 2335 }, { "epoch": 2.310583580613254, "grad_norm": 0.2748946608527433, "learning_rate": 1.2770245511176257e-05, "loss": 0.3927, "step": 2336 }, { "epoch": 2.311572700296736, "grad_norm": 0.24688576678949595, "learning_rate": 1.2751923781604985e-05, "loss": 0.3877, "step": 2337 }, { "epoch": 2.3125618199802176, "grad_norm": 0.27181775950372444, "learning_rate": 1.2733602052033713e-05, "loss": 0.3822, "step": 2338 }, { "epoch": 2.313550939663699, "grad_norm": 0.22921656595976417, "learning_rate": 1.2715280322462441e-05, "loss": 0.2951, "step": 2339 }, { "epoch": 2.314540059347181, "grad_norm": 0.2575726813551104, "learning_rate": 1.269695859289117e-05, "loss": 0.3406, "step": 2340 }, { "epoch": 2.315529179030663, "grad_norm": 0.2555589633497033, "learning_rate": 1.2678636863319898e-05, "loss": 0.3396, "step": 2341 }, { "epoch": 2.3165182987141444, "grad_norm": 0.2364187403064698, "learning_rate": 1.2660315133748626e-05, "loss": 0.377, "step": 2342 }, { "epoch": 2.3175074183976263, "grad_norm": 0.2302096798238319, "learning_rate": 1.2641993404177354e-05, "loss": 0.3319, "step": 2343 }, { "epoch": 2.318496538081108, "grad_norm": 0.24888650385331362, "learning_rate": 1.2623671674606084e-05, "loss": 0.3066, "step": 2344 }, { "epoch": 2.3194856577645897, "grad_norm": 0.2773610707597904, "learning_rate": 1.2605349945034812e-05, "loss": 0.3414, "step": 2345 }, { "epoch": 2.320474777448071, "grad_norm": 0.2154430712098479, "learning_rate": 1.258702821546354e-05, "loss": 0.2968, "step": 2346 }, { "epoch": 2.321463897131553, "grad_norm": 0.2338043867905404, "learning_rate": 1.2568706485892268e-05, "loss": 0.3118, "step": 2347 }, { "epoch": 2.3224530168150346, "grad_norm": 0.24602167457573554, "learning_rate": 1.2550384756320998e-05, "loss": 0.3702, "step": 2348 }, { "epoch": 2.3234421364985165, "grad_norm": 0.2327990964511457, "learning_rate": 1.2532063026749728e-05, "loss": 0.3523, "step": 2349 }, { "epoch": 2.324431256181998, "grad_norm": 0.2589528675798955, "learning_rate": 1.2513741297178456e-05, "loss": 0.3864, "step": 2350 }, { "epoch": 2.32542037586548, "grad_norm": 0.26176208543972257, "learning_rate": 1.2495419567607184e-05, "loss": 0.3336, "step": 2351 }, { "epoch": 2.3264094955489614, "grad_norm": 0.2844298379253604, "learning_rate": 1.2477097838035912e-05, "loss": 0.3492, "step": 2352 }, { "epoch": 2.3273986152324433, "grad_norm": 0.2546167667799685, "learning_rate": 1.245877610846464e-05, "loss": 0.3555, "step": 2353 }, { "epoch": 2.3283877349159248, "grad_norm": 0.2696964327271954, "learning_rate": 1.2440454378893368e-05, "loss": 0.3706, "step": 2354 }, { "epoch": 2.3293768545994067, "grad_norm": 0.31827984227427614, "learning_rate": 1.2422132649322096e-05, "loss": 0.332, "step": 2355 }, { "epoch": 2.330365974282888, "grad_norm": 0.2240056160701879, "learning_rate": 1.2403810919750824e-05, "loss": 0.3328, "step": 2356 }, { "epoch": 2.33135509396637, "grad_norm": 0.2517740200145697, "learning_rate": 1.2385489190179554e-05, "loss": 0.372, "step": 2357 }, { "epoch": 2.3323442136498516, "grad_norm": 0.24940675998881365, "learning_rate": 1.2367167460608282e-05, "loss": 0.3959, "step": 2358 }, { "epoch": 2.3333333333333335, "grad_norm": 0.2459642937733617, "learning_rate": 1.234884573103701e-05, "loss": 0.3492, "step": 2359 }, { "epoch": 2.334322453016815, "grad_norm": 0.22768174832698604, "learning_rate": 1.2330524001465739e-05, "loss": 0.3404, "step": 2360 }, { "epoch": 2.335311572700297, "grad_norm": 0.22086915383890057, "learning_rate": 1.2312202271894467e-05, "loss": 0.3099, "step": 2361 }, { "epoch": 2.3363006923837784, "grad_norm": 0.22113520177366824, "learning_rate": 1.2293880542323195e-05, "loss": 0.3393, "step": 2362 }, { "epoch": 2.3372898120672603, "grad_norm": 2.991113459889468, "learning_rate": 1.2275558812751925e-05, "loss": 0.3309, "step": 2363 }, { "epoch": 2.3382789317507418, "grad_norm": 0.29054610148791177, "learning_rate": 1.2257237083180653e-05, "loss": 0.3655, "step": 2364 }, { "epoch": 2.3392680514342237, "grad_norm": 0.24093202591978707, "learning_rate": 1.2238915353609381e-05, "loss": 0.3428, "step": 2365 }, { "epoch": 2.340257171117705, "grad_norm": 0.23736300980535247, "learning_rate": 1.2220593624038109e-05, "loss": 0.3229, "step": 2366 }, { "epoch": 2.341246290801187, "grad_norm": 0.35258591516934035, "learning_rate": 1.2202271894466839e-05, "loss": 0.3363, "step": 2367 }, { "epoch": 2.3422354104846685, "grad_norm": 0.26676374709095013, "learning_rate": 1.2183950164895567e-05, "loss": 0.3211, "step": 2368 }, { "epoch": 2.3432245301681505, "grad_norm": 0.2351851020565668, "learning_rate": 1.2165628435324295e-05, "loss": 0.35, "step": 2369 }, { "epoch": 2.344213649851632, "grad_norm": 0.22232852533104622, "learning_rate": 1.2147306705753025e-05, "loss": 0.31, "step": 2370 }, { "epoch": 2.345202769535114, "grad_norm": 0.23680359247744562, "learning_rate": 1.2128984976181753e-05, "loss": 0.3478, "step": 2371 }, { "epoch": 2.3461918892185953, "grad_norm": 0.24988403986844843, "learning_rate": 1.2110663246610481e-05, "loss": 0.3635, "step": 2372 }, { "epoch": 2.3471810089020773, "grad_norm": 0.24767501874015097, "learning_rate": 1.2092341517039209e-05, "loss": 0.3883, "step": 2373 }, { "epoch": 2.3481701285855587, "grad_norm": 0.2521283236697681, "learning_rate": 1.2074019787467937e-05, "loss": 0.4225, "step": 2374 }, { "epoch": 2.3491592482690407, "grad_norm": 0.24675041358192287, "learning_rate": 1.2055698057896667e-05, "loss": 0.3663, "step": 2375 }, { "epoch": 2.350148367952522, "grad_norm": 0.24311636349663465, "learning_rate": 1.2037376328325395e-05, "loss": 0.3759, "step": 2376 }, { "epoch": 2.351137487636004, "grad_norm": 0.24263994864791805, "learning_rate": 1.2019054598754123e-05, "loss": 0.38, "step": 2377 }, { "epoch": 2.3521266073194855, "grad_norm": 0.25402272877139787, "learning_rate": 1.2000732869182851e-05, "loss": 0.31, "step": 2378 }, { "epoch": 2.3531157270029674, "grad_norm": 0.2495073709635195, "learning_rate": 1.198241113961158e-05, "loss": 0.3659, "step": 2379 }, { "epoch": 2.354104846686449, "grad_norm": 0.2298654377866694, "learning_rate": 1.1964089410040308e-05, "loss": 0.3209, "step": 2380 }, { "epoch": 2.355093966369931, "grad_norm": 0.28863897336682676, "learning_rate": 1.1945767680469037e-05, "loss": 0.3417, "step": 2381 }, { "epoch": 2.3560830860534123, "grad_norm": 0.41489670024603986, "learning_rate": 1.1927445950897766e-05, "loss": 0.3503, "step": 2382 }, { "epoch": 2.3570722057368942, "grad_norm": 0.2205422492533083, "learning_rate": 1.1909124221326494e-05, "loss": 0.3529, "step": 2383 }, { "epoch": 2.3580613254203757, "grad_norm": 0.2364704954092965, "learning_rate": 1.1890802491755222e-05, "loss": 0.3625, "step": 2384 }, { "epoch": 2.3590504451038576, "grad_norm": 0.2476419419748873, "learning_rate": 1.187248076218395e-05, "loss": 0.3181, "step": 2385 }, { "epoch": 2.360039564787339, "grad_norm": 0.24541438121836218, "learning_rate": 1.1854159032612678e-05, "loss": 0.3931, "step": 2386 }, { "epoch": 2.361028684470821, "grad_norm": 0.23259123379943467, "learning_rate": 1.1835837303041408e-05, "loss": 0.3599, "step": 2387 }, { "epoch": 2.3620178041543025, "grad_norm": 0.24998847686378414, "learning_rate": 1.1817515573470136e-05, "loss": 0.3454, "step": 2388 }, { "epoch": 2.3630069238377844, "grad_norm": 0.2488071258698388, "learning_rate": 1.1799193843898864e-05, "loss": 0.3633, "step": 2389 }, { "epoch": 2.363996043521266, "grad_norm": 0.22105819528108095, "learning_rate": 1.1780872114327594e-05, "loss": 0.332, "step": 2390 }, { "epoch": 2.364985163204748, "grad_norm": 0.2334569251699502, "learning_rate": 1.1762550384756322e-05, "loss": 0.3415, "step": 2391 }, { "epoch": 2.3659742828882293, "grad_norm": 0.27195122783585196, "learning_rate": 1.174422865518505e-05, "loss": 0.3633, "step": 2392 }, { "epoch": 2.3669634025717112, "grad_norm": 0.2687234363535735, "learning_rate": 1.172590692561378e-05, "loss": 0.3223, "step": 2393 }, { "epoch": 2.3679525222551927, "grad_norm": 0.27418695750053923, "learning_rate": 1.1707585196042508e-05, "loss": 0.348, "step": 2394 }, { "epoch": 2.3689416419386746, "grad_norm": 0.2250392951845017, "learning_rate": 1.1689263466471236e-05, "loss": 0.3367, "step": 2395 }, { "epoch": 2.369930761622156, "grad_norm": 0.2428040281312013, "learning_rate": 1.1670941736899964e-05, "loss": 0.3316, "step": 2396 }, { "epoch": 2.370919881305638, "grad_norm": 0.257913044877425, "learning_rate": 1.1652620007328692e-05, "loss": 0.3541, "step": 2397 }, { "epoch": 2.3719090009891195, "grad_norm": 0.24888673843156847, "learning_rate": 1.163429827775742e-05, "loss": 0.2872, "step": 2398 }, { "epoch": 2.3728981206726014, "grad_norm": 0.22992256914847725, "learning_rate": 1.161597654818615e-05, "loss": 0.3668, "step": 2399 }, { "epoch": 2.373887240356083, "grad_norm": 0.23105699693670317, "learning_rate": 1.1597654818614878e-05, "loss": 0.3354, "step": 2400 }, { "epoch": 2.374876360039565, "grad_norm": 0.23878238019277057, "learning_rate": 1.1579333089043606e-05, "loss": 0.3339, "step": 2401 }, { "epoch": 2.3758654797230463, "grad_norm": 0.23787914735351112, "learning_rate": 1.1561011359472335e-05, "loss": 0.3409, "step": 2402 }, { "epoch": 2.376854599406528, "grad_norm": 0.21777157204428282, "learning_rate": 1.1542689629901063e-05, "loss": 0.3152, "step": 2403 }, { "epoch": 2.37784371909001, "grad_norm": 0.23084255588965574, "learning_rate": 1.152436790032979e-05, "loss": 0.3406, "step": 2404 }, { "epoch": 2.3788328387734916, "grad_norm": 0.23000546672119904, "learning_rate": 1.150604617075852e-05, "loss": 0.3362, "step": 2405 }, { "epoch": 2.379821958456973, "grad_norm": 0.24929393096074584, "learning_rate": 1.1487724441187249e-05, "loss": 0.3395, "step": 2406 }, { "epoch": 2.380811078140455, "grad_norm": 0.2253847100064449, "learning_rate": 1.1469402711615977e-05, "loss": 0.2843, "step": 2407 }, { "epoch": 2.381800197823937, "grad_norm": 0.21736855996444068, "learning_rate": 1.1451080982044705e-05, "loss": 0.3224, "step": 2408 }, { "epoch": 2.3827893175074184, "grad_norm": 0.26077964199405224, "learning_rate": 1.1432759252473433e-05, "loss": 0.3754, "step": 2409 }, { "epoch": 2.3837784371909, "grad_norm": 0.23441329142827347, "learning_rate": 1.1414437522902161e-05, "loss": 0.3193, "step": 2410 }, { "epoch": 2.384767556874382, "grad_norm": 0.22460852238892978, "learning_rate": 1.1396115793330891e-05, "loss": 0.3019, "step": 2411 }, { "epoch": 2.3857566765578637, "grad_norm": 0.22061710031425286, "learning_rate": 1.1377794063759619e-05, "loss": 0.329, "step": 2412 }, { "epoch": 2.386745796241345, "grad_norm": 0.23283333531597325, "learning_rate": 1.1359472334188349e-05, "loss": 0.3475, "step": 2413 }, { "epoch": 2.3877349159248267, "grad_norm": 0.23549258278998214, "learning_rate": 1.1341150604617077e-05, "loss": 0.3681, "step": 2414 }, { "epoch": 2.3887240356083086, "grad_norm": 0.23746707960246552, "learning_rate": 1.1322828875045805e-05, "loss": 0.3218, "step": 2415 }, { "epoch": 2.3897131552917905, "grad_norm": 0.21232937070558, "learning_rate": 1.1304507145474533e-05, "loss": 0.3096, "step": 2416 }, { "epoch": 2.390702274975272, "grad_norm": 0.22158582934961538, "learning_rate": 1.1286185415903263e-05, "loss": 0.3158, "step": 2417 }, { "epoch": 2.3916913946587535, "grad_norm": 0.2481025761788115, "learning_rate": 1.1267863686331991e-05, "loss": 0.3196, "step": 2418 }, { "epoch": 2.3926805143422354, "grad_norm": 0.24337539698694738, "learning_rate": 1.124954195676072e-05, "loss": 0.3259, "step": 2419 }, { "epoch": 2.3936696340257173, "grad_norm": 0.228969243738579, "learning_rate": 1.1231220227189447e-05, "loss": 0.3427, "step": 2420 }, { "epoch": 2.394658753709199, "grad_norm": 0.22025974633504555, "learning_rate": 1.1212898497618175e-05, "loss": 0.3483, "step": 2421 }, { "epoch": 2.3956478733926807, "grad_norm": 0.26184165101054785, "learning_rate": 1.1194576768046904e-05, "loss": 0.3828, "step": 2422 }, { "epoch": 2.396636993076162, "grad_norm": 0.24478098303647366, "learning_rate": 1.1176255038475633e-05, "loss": 0.3599, "step": 2423 }, { "epoch": 2.397626112759644, "grad_norm": 0.24261159243488178, "learning_rate": 1.1157933308904361e-05, "loss": 0.3112, "step": 2424 }, { "epoch": 2.3986152324431256, "grad_norm": 0.22301944660446946, "learning_rate": 1.113961157933309e-05, "loss": 0.3214, "step": 2425 }, { "epoch": 2.3996043521266075, "grad_norm": 0.24202065872251996, "learning_rate": 1.1121289849761818e-05, "loss": 0.3373, "step": 2426 }, { "epoch": 2.400593471810089, "grad_norm": 0.26782140193309023, "learning_rate": 1.1102968120190546e-05, "loss": 0.3926, "step": 2427 }, { "epoch": 2.401582591493571, "grad_norm": 0.24029910736845295, "learning_rate": 1.1084646390619274e-05, "loss": 0.368, "step": 2428 }, { "epoch": 2.4025717111770524, "grad_norm": 0.2407671258225506, "learning_rate": 1.1066324661048004e-05, "loss": 0.3423, "step": 2429 }, { "epoch": 2.4035608308605343, "grad_norm": 0.24572982261117407, "learning_rate": 1.1048002931476732e-05, "loss": 0.3482, "step": 2430 }, { "epoch": 2.4045499505440158, "grad_norm": 0.2788166813071352, "learning_rate": 1.102968120190546e-05, "loss": 0.3574, "step": 2431 }, { "epoch": 2.4055390702274977, "grad_norm": 0.23296254668053415, "learning_rate": 1.1011359472334188e-05, "loss": 0.3137, "step": 2432 }, { "epoch": 2.406528189910979, "grad_norm": 0.24801668753216083, "learning_rate": 1.0993037742762916e-05, "loss": 0.3509, "step": 2433 }, { "epoch": 2.407517309594461, "grad_norm": 0.25891621196791104, "learning_rate": 1.0974716013191646e-05, "loss": 0.3266, "step": 2434 }, { "epoch": 2.4085064292779426, "grad_norm": 0.25721058330749236, "learning_rate": 1.0956394283620374e-05, "loss": 0.3299, "step": 2435 }, { "epoch": 2.4094955489614245, "grad_norm": 0.23910805211819738, "learning_rate": 1.0938072554049102e-05, "loss": 0.3222, "step": 2436 }, { "epoch": 2.410484668644906, "grad_norm": 0.7585722434930996, "learning_rate": 1.0919750824477832e-05, "loss": 0.3307, "step": 2437 }, { "epoch": 2.411473788328388, "grad_norm": 0.2622094628177202, "learning_rate": 1.090142909490656e-05, "loss": 0.3657, "step": 2438 }, { "epoch": 2.4124629080118694, "grad_norm": 0.24659250738864202, "learning_rate": 1.0883107365335288e-05, "loss": 0.3411, "step": 2439 }, { "epoch": 2.4134520276953513, "grad_norm": 0.24754083987784184, "learning_rate": 1.0864785635764016e-05, "loss": 0.3949, "step": 2440 }, { "epoch": 2.4144411473788328, "grad_norm": 0.2300523326797196, "learning_rate": 1.0846463906192746e-05, "loss": 0.3217, "step": 2441 }, { "epoch": 2.4154302670623147, "grad_norm": 0.268740945166187, "learning_rate": 1.0828142176621474e-05, "loss": 0.3416, "step": 2442 }, { "epoch": 2.416419386745796, "grad_norm": 0.25252939297829813, "learning_rate": 1.0809820447050202e-05, "loss": 0.3397, "step": 2443 }, { "epoch": 2.417408506429278, "grad_norm": 0.2530969379846874, "learning_rate": 1.079149871747893e-05, "loss": 0.3864, "step": 2444 }, { "epoch": 2.4183976261127595, "grad_norm": 0.24562207385794602, "learning_rate": 1.0773176987907659e-05, "loss": 0.3548, "step": 2445 }, { "epoch": 2.4193867457962415, "grad_norm": 0.2441336356073428, "learning_rate": 1.0754855258336387e-05, "loss": 0.3508, "step": 2446 }, { "epoch": 2.420375865479723, "grad_norm": 0.33254572988738407, "learning_rate": 1.0736533528765116e-05, "loss": 0.3308, "step": 2447 }, { "epoch": 2.421364985163205, "grad_norm": 0.33598359004717754, "learning_rate": 1.0718211799193845e-05, "loss": 0.3371, "step": 2448 }, { "epoch": 2.4223541048466863, "grad_norm": 0.24536171632195142, "learning_rate": 1.0699890069622573e-05, "loss": 0.346, "step": 2449 }, { "epoch": 2.4233432245301683, "grad_norm": 0.220072693747127, "learning_rate": 1.06815683400513e-05, "loss": 0.3051, "step": 2450 }, { "epoch": 2.4243323442136497, "grad_norm": 0.26313981909407597, "learning_rate": 1.0663246610480029e-05, "loss": 0.3351, "step": 2451 }, { "epoch": 2.4253214638971317, "grad_norm": 0.2710866024316405, "learning_rate": 1.0644924880908757e-05, "loss": 0.3164, "step": 2452 }, { "epoch": 2.426310583580613, "grad_norm": 0.25396458294554114, "learning_rate": 1.0626603151337487e-05, "loss": 0.3463, "step": 2453 }, { "epoch": 2.427299703264095, "grad_norm": 0.2252749311775426, "learning_rate": 1.0608281421766215e-05, "loss": 0.354, "step": 2454 }, { "epoch": 2.4282888229475765, "grad_norm": 0.2606999859832173, "learning_rate": 1.0589959692194943e-05, "loss": 0.3511, "step": 2455 }, { "epoch": 2.4292779426310585, "grad_norm": 0.37868283734687525, "learning_rate": 1.0571637962623671e-05, "loss": 0.361, "step": 2456 }, { "epoch": 2.43026706231454, "grad_norm": 0.23778601344518252, "learning_rate": 1.0553316233052401e-05, "loss": 0.3375, "step": 2457 }, { "epoch": 2.431256181998022, "grad_norm": 0.23759219268753948, "learning_rate": 1.0534994503481129e-05, "loss": 0.3371, "step": 2458 }, { "epoch": 2.4322453016815033, "grad_norm": 0.22579121090246385, "learning_rate": 1.0516672773909857e-05, "loss": 0.3615, "step": 2459 }, { "epoch": 2.4332344213649852, "grad_norm": 0.23369877437828138, "learning_rate": 1.0498351044338587e-05, "loss": 0.375, "step": 2460 }, { "epoch": 2.4342235410484667, "grad_norm": 0.22961846338817127, "learning_rate": 1.0480029314767315e-05, "loss": 0.3484, "step": 2461 }, { "epoch": 2.4352126607319486, "grad_norm": 0.22887264558327763, "learning_rate": 1.0461707585196043e-05, "loss": 0.3385, "step": 2462 }, { "epoch": 2.43620178041543, "grad_norm": 0.25032092756268276, "learning_rate": 1.0443385855624771e-05, "loss": 0.3151, "step": 2463 }, { "epoch": 2.437190900098912, "grad_norm": 0.23728928057357426, "learning_rate": 1.0425064126053501e-05, "loss": 0.3813, "step": 2464 }, { "epoch": 2.4381800197823935, "grad_norm": 0.2438827034129295, "learning_rate": 1.040674239648223e-05, "loss": 0.3659, "step": 2465 }, { "epoch": 2.4391691394658754, "grad_norm": 0.2778214140943854, "learning_rate": 1.0388420666910957e-05, "loss": 0.3513, "step": 2466 }, { "epoch": 2.440158259149357, "grad_norm": 0.22815945841865654, "learning_rate": 1.0370098937339686e-05, "loss": 0.3447, "step": 2467 }, { "epoch": 2.441147378832839, "grad_norm": 0.23528405331897878, "learning_rate": 1.0351777207768414e-05, "loss": 0.3352, "step": 2468 }, { "epoch": 2.4421364985163203, "grad_norm": 0.235995080768111, "learning_rate": 1.0333455478197142e-05, "loss": 0.3263, "step": 2469 }, { "epoch": 2.4431256181998022, "grad_norm": 0.21678114600038997, "learning_rate": 1.0315133748625872e-05, "loss": 0.3112, "step": 2470 }, { "epoch": 2.4441147378832837, "grad_norm": 0.25806394891487444, "learning_rate": 1.02968120190546e-05, "loss": 0.3551, "step": 2471 }, { "epoch": 2.4451038575667656, "grad_norm": 0.26401170188018913, "learning_rate": 1.0278490289483328e-05, "loss": 0.39, "step": 2472 }, { "epoch": 2.446092977250247, "grad_norm": 0.21740446151393822, "learning_rate": 1.0260168559912056e-05, "loss": 0.3286, "step": 2473 }, { "epoch": 2.447082096933729, "grad_norm": 0.23443927897272304, "learning_rate": 1.0241846830340784e-05, "loss": 0.3652, "step": 2474 }, { "epoch": 2.4480712166172105, "grad_norm": 0.218480918457683, "learning_rate": 1.0223525100769512e-05, "loss": 0.3383, "step": 2475 }, { "epoch": 2.4490603363006924, "grad_norm": 0.2203346851930569, "learning_rate": 1.0205203371198242e-05, "loss": 0.3401, "step": 2476 }, { "epoch": 2.450049455984174, "grad_norm": 0.22869849971032385, "learning_rate": 1.018688164162697e-05, "loss": 0.3308, "step": 2477 }, { "epoch": 2.451038575667656, "grad_norm": 0.2363096068659232, "learning_rate": 1.0168559912055698e-05, "loss": 0.3628, "step": 2478 }, { "epoch": 2.4520276953511377, "grad_norm": 0.21724450496142547, "learning_rate": 1.0150238182484426e-05, "loss": 0.3435, "step": 2479 }, { "epoch": 2.453016815034619, "grad_norm": 0.24107977424377153, "learning_rate": 1.0131916452913156e-05, "loss": 0.365, "step": 2480 }, { "epoch": 2.4540059347181007, "grad_norm": 0.24828390832939212, "learning_rate": 1.0113594723341884e-05, "loss": 0.3366, "step": 2481 }, { "epoch": 2.4549950544015826, "grad_norm": 0.20569807648782604, "learning_rate": 1.0095272993770612e-05, "loss": 0.2761, "step": 2482 }, { "epoch": 2.4559841740850645, "grad_norm": 0.2322910616182908, "learning_rate": 1.0076951264199342e-05, "loss": 0.3741, "step": 2483 }, { "epoch": 2.456973293768546, "grad_norm": 0.24079586769347716, "learning_rate": 1.005862953462807e-05, "loss": 0.3657, "step": 2484 }, { "epoch": 2.4579624134520275, "grad_norm": 0.24451423706174355, "learning_rate": 1.0040307805056798e-05, "loss": 0.3375, "step": 2485 }, { "epoch": 2.4589515331355094, "grad_norm": 0.21789521602688885, "learning_rate": 1.0021986075485526e-05, "loss": 0.3115, "step": 2486 }, { "epoch": 2.4599406528189913, "grad_norm": 0.2367007818716319, "learning_rate": 1.0003664345914255e-05, "loss": 0.3434, "step": 2487 }, { "epoch": 2.460929772502473, "grad_norm": 0.2307538273441414, "learning_rate": 9.985342616342984e-06, "loss": 0.3279, "step": 2488 }, { "epoch": 2.4619188921859543, "grad_norm": 0.2366577365331995, "learning_rate": 9.967020886771712e-06, "loss": 0.3405, "step": 2489 }, { "epoch": 2.462908011869436, "grad_norm": 0.21292878456865053, "learning_rate": 9.94869915720044e-06, "loss": 0.3271, "step": 2490 }, { "epoch": 2.463897131552918, "grad_norm": 0.21294135453400145, "learning_rate": 9.930377427629169e-06, "loss": 0.312, "step": 2491 }, { "epoch": 2.4648862512363996, "grad_norm": 0.2406136238914857, "learning_rate": 9.912055698057897e-06, "loss": 0.3537, "step": 2492 }, { "epoch": 2.465875370919881, "grad_norm": 0.25484410374642147, "learning_rate": 9.893733968486625e-06, "loss": 0.3134, "step": 2493 }, { "epoch": 2.466864490603363, "grad_norm": 0.2620293766154584, "learning_rate": 9.875412238915355e-06, "loss": 0.4176, "step": 2494 }, { "epoch": 2.467853610286845, "grad_norm": 0.23944770550967767, "learning_rate": 9.857090509344083e-06, "loss": 0.356, "step": 2495 }, { "epoch": 2.4688427299703264, "grad_norm": 0.25086364175393056, "learning_rate": 9.838768779772811e-06, "loss": 0.3575, "step": 2496 }, { "epoch": 2.469831849653808, "grad_norm": 0.23501454699640065, "learning_rate": 9.820447050201539e-06, "loss": 0.3876, "step": 2497 }, { "epoch": 2.47082096933729, "grad_norm": 0.2551766847463871, "learning_rate": 9.802125320630267e-06, "loss": 0.3367, "step": 2498 }, { "epoch": 2.4718100890207717, "grad_norm": 0.2361580571158426, "learning_rate": 9.783803591058995e-06, "loss": 0.4009, "step": 2499 }, { "epoch": 2.472799208704253, "grad_norm": 1.4900062991141605, "learning_rate": 9.765481861487725e-06, "loss": 0.3581, "step": 2500 }, { "epoch": 2.473788328387735, "grad_norm": 0.23073936660230762, "learning_rate": 9.747160131916453e-06, "loss": 0.3565, "step": 2501 }, { "epoch": 2.4747774480712166, "grad_norm": 0.24682663116703085, "learning_rate": 9.728838402345181e-06, "loss": 0.3535, "step": 2502 }, { "epoch": 2.4757665677546985, "grad_norm": 0.27034996608934037, "learning_rate": 9.710516672773911e-06, "loss": 0.3916, "step": 2503 }, { "epoch": 2.47675568743818, "grad_norm": 0.2303037854565598, "learning_rate": 9.69219494320264e-06, "loss": 0.3436, "step": 2504 }, { "epoch": 2.477744807121662, "grad_norm": 0.23585596331964975, "learning_rate": 9.673873213631367e-06, "loss": 0.3118, "step": 2505 }, { "epoch": 2.4787339268051434, "grad_norm": 0.23726271220235548, "learning_rate": 9.655551484060097e-06, "loss": 0.3394, "step": 2506 }, { "epoch": 2.4797230464886253, "grad_norm": 0.2487286182774457, "learning_rate": 9.637229754488825e-06, "loss": 0.3573, "step": 2507 }, { "epoch": 2.4807121661721068, "grad_norm": 0.2698429096752246, "learning_rate": 9.618908024917553e-06, "loss": 0.3876, "step": 2508 }, { "epoch": 2.4817012858555887, "grad_norm": 0.24070333522567572, "learning_rate": 9.600586295346281e-06, "loss": 0.3465, "step": 2509 }, { "epoch": 2.48269040553907, "grad_norm": 0.24206724387701353, "learning_rate": 9.58226456577501e-06, "loss": 0.3652, "step": 2510 }, { "epoch": 2.483679525222552, "grad_norm": 0.22545629834132364, "learning_rate": 9.563942836203738e-06, "loss": 0.3275, "step": 2511 }, { "epoch": 2.4846686449060336, "grad_norm": 0.23858808844243973, "learning_rate": 9.545621106632467e-06, "loss": 0.3505, "step": 2512 }, { "epoch": 2.4856577645895155, "grad_norm": 0.2728863075026196, "learning_rate": 9.527299377061196e-06, "loss": 0.3541, "step": 2513 }, { "epoch": 2.486646884272997, "grad_norm": 0.23294695154773748, "learning_rate": 9.508977647489924e-06, "loss": 0.3574, "step": 2514 }, { "epoch": 2.487636003956479, "grad_norm": 0.24124966388283223, "learning_rate": 9.490655917918652e-06, "loss": 0.3243, "step": 2515 }, { "epoch": 2.4886251236399604, "grad_norm": 0.2320389035173641, "learning_rate": 9.47233418834738e-06, "loss": 0.3245, "step": 2516 }, { "epoch": 2.4896142433234423, "grad_norm": 0.26700657514143444, "learning_rate": 9.454012458776108e-06, "loss": 0.3852, "step": 2517 }, { "epoch": 2.4906033630069238, "grad_norm": 0.2507593747565587, "learning_rate": 9.435690729204838e-06, "loss": 0.3476, "step": 2518 }, { "epoch": 2.4915924826904057, "grad_norm": 0.21748192964113497, "learning_rate": 9.417368999633566e-06, "loss": 0.3334, "step": 2519 }, { "epoch": 2.492581602373887, "grad_norm": 0.25264072777527896, "learning_rate": 9.399047270062294e-06, "loss": 0.3685, "step": 2520 }, { "epoch": 2.493570722057369, "grad_norm": 0.24839443510243983, "learning_rate": 9.380725540491022e-06, "loss": 0.3523, "step": 2521 }, { "epoch": 2.4945598417408505, "grad_norm": 0.22023654916565122, "learning_rate": 9.36240381091975e-06, "loss": 0.32, "step": 2522 }, { "epoch": 2.4955489614243325, "grad_norm": 0.2441603761072515, "learning_rate": 9.344082081348478e-06, "loss": 0.3677, "step": 2523 }, { "epoch": 2.496538081107814, "grad_norm": 0.23410429303428584, "learning_rate": 9.325760351777208e-06, "loss": 0.3282, "step": 2524 }, { "epoch": 2.497527200791296, "grad_norm": 0.5284404116687907, "learning_rate": 9.307438622205936e-06, "loss": 0.3842, "step": 2525 }, { "epoch": 2.4985163204747773, "grad_norm": 0.2597369164991658, "learning_rate": 9.289116892634666e-06, "loss": 0.3564, "step": 2526 }, { "epoch": 2.4995054401582593, "grad_norm": 1.5905916143954588, "learning_rate": 9.270795163063394e-06, "loss": 0.4346, "step": 2527 }, { "epoch": 2.5004945598417407, "grad_norm": 0.2406750160968987, "learning_rate": 9.252473433492122e-06, "loss": 0.3277, "step": 2528 }, { "epoch": 2.5014836795252227, "grad_norm": 0.25687044415825083, "learning_rate": 9.23415170392085e-06, "loss": 0.3453, "step": 2529 }, { "epoch": 2.502472799208704, "grad_norm": 0.25135048181774106, "learning_rate": 9.21582997434958e-06, "loss": 0.3547, "step": 2530 }, { "epoch": 2.503461918892186, "grad_norm": 0.24560280016284752, "learning_rate": 9.197508244778308e-06, "loss": 0.3661, "step": 2531 }, { "epoch": 2.5044510385756675, "grad_norm": 0.49064750366441084, "learning_rate": 9.179186515207036e-06, "loss": 0.3812, "step": 2532 }, { "epoch": 2.5054401582591495, "grad_norm": 0.22993415291097227, "learning_rate": 9.160864785635765e-06, "loss": 0.3637, "step": 2533 }, { "epoch": 2.506429277942631, "grad_norm": 0.21741000097273247, "learning_rate": 9.142543056064493e-06, "loss": 0.3319, "step": 2534 }, { "epoch": 2.507418397626113, "grad_norm": 0.22510085134096802, "learning_rate": 9.12422132649322e-06, "loss": 0.3054, "step": 2535 }, { "epoch": 2.5084075173095943, "grad_norm": 0.23033682251844267, "learning_rate": 9.10589959692195e-06, "loss": 0.3684, "step": 2536 }, { "epoch": 2.5093966369930762, "grad_norm": 0.23336881930405867, "learning_rate": 9.087577867350679e-06, "loss": 0.336, "step": 2537 }, { "epoch": 2.5103857566765577, "grad_norm": 0.25835519741672547, "learning_rate": 9.069256137779407e-06, "loss": 0.3665, "step": 2538 }, { "epoch": 2.5113748763600396, "grad_norm": 0.21527278722575882, "learning_rate": 9.050934408208135e-06, "loss": 0.3271, "step": 2539 }, { "epoch": 2.512363996043521, "grad_norm": 0.22846804776774987, "learning_rate": 9.032612678636863e-06, "loss": 0.3634, "step": 2540 }, { "epoch": 2.513353115727003, "grad_norm": 0.2461420001985178, "learning_rate": 9.014290949065591e-06, "loss": 0.3618, "step": 2541 }, { "epoch": 2.5143422354104845, "grad_norm": 0.24677056500039793, "learning_rate": 8.995969219494321e-06, "loss": 0.3474, "step": 2542 }, { "epoch": 2.5153313550939664, "grad_norm": 0.23552827235851204, "learning_rate": 8.977647489923049e-06, "loss": 0.3477, "step": 2543 }, { "epoch": 2.516320474777448, "grad_norm": 0.23544874721670853, "learning_rate": 8.959325760351777e-06, "loss": 0.3665, "step": 2544 }, { "epoch": 2.51730959446093, "grad_norm": 0.25589519603994987, "learning_rate": 8.941004030780505e-06, "loss": 0.3676, "step": 2545 }, { "epoch": 2.5182987141444113, "grad_norm": 0.22529131811822634, "learning_rate": 8.922682301209233e-06, "loss": 0.342, "step": 2546 }, { "epoch": 2.5192878338278932, "grad_norm": 0.22830805413949087, "learning_rate": 8.904360571637963e-06, "loss": 0.3228, "step": 2547 }, { "epoch": 2.5202769535113747, "grad_norm": 0.24283043796243683, "learning_rate": 8.886038842066691e-06, "loss": 0.3521, "step": 2548 }, { "epoch": 2.5212660731948566, "grad_norm": 0.23539048865709836, "learning_rate": 8.86771711249542e-06, "loss": 0.3232, "step": 2549 }, { "epoch": 2.5222551928783385, "grad_norm": 0.22416815087474565, "learning_rate": 8.84939538292415e-06, "loss": 0.3502, "step": 2550 }, { "epoch": 2.52324431256182, "grad_norm": 0.24356963148470098, "learning_rate": 8.831073653352877e-06, "loss": 0.3041, "step": 2551 }, { "epoch": 2.5242334322453015, "grad_norm": 0.2395352023996427, "learning_rate": 8.812751923781605e-06, "loss": 0.3787, "step": 2552 }, { "epoch": 2.5252225519287834, "grad_norm": 0.23644859570044596, "learning_rate": 8.794430194210334e-06, "loss": 0.3657, "step": 2553 }, { "epoch": 2.5262116716122653, "grad_norm": 0.253996407977228, "learning_rate": 8.776108464639063e-06, "loss": 0.3445, "step": 2554 }, { "epoch": 2.527200791295747, "grad_norm": 0.251282958120267, "learning_rate": 8.757786735067792e-06, "loss": 0.3536, "step": 2555 }, { "epoch": 2.5281899109792283, "grad_norm": 0.45586735138497564, "learning_rate": 8.73946500549652e-06, "loss": 0.3641, "step": 2556 }, { "epoch": 2.52917903066271, "grad_norm": 0.25052516019444043, "learning_rate": 8.721143275925248e-06, "loss": 0.3669, "step": 2557 }, { "epoch": 2.530168150346192, "grad_norm": 0.23437819091702222, "learning_rate": 8.702821546353976e-06, "loss": 0.3283, "step": 2558 }, { "epoch": 2.5311572700296736, "grad_norm": 0.22284762353121085, "learning_rate": 8.684499816782704e-06, "loss": 0.3145, "step": 2559 }, { "epoch": 2.532146389713155, "grad_norm": 0.22449486144864736, "learning_rate": 8.666178087211434e-06, "loss": 0.3553, "step": 2560 }, { "epoch": 2.533135509396637, "grad_norm": 0.2431824560198144, "learning_rate": 8.647856357640162e-06, "loss": 0.353, "step": 2561 }, { "epoch": 2.534124629080119, "grad_norm": 0.24220316126556765, "learning_rate": 8.62953462806889e-06, "loss": 0.3698, "step": 2562 }, { "epoch": 2.5351137487636004, "grad_norm": 0.24561350691928416, "learning_rate": 8.611212898497618e-06, "loss": 0.3541, "step": 2563 }, { "epoch": 2.536102868447082, "grad_norm": 0.26058782429161903, "learning_rate": 8.592891168926346e-06, "loss": 0.358, "step": 2564 }, { "epoch": 2.537091988130564, "grad_norm": 0.24483285029632448, "learning_rate": 8.574569439355074e-06, "loss": 0.3722, "step": 2565 }, { "epoch": 2.5380811078140457, "grad_norm": 0.22463143941644045, "learning_rate": 8.556247709783804e-06, "loss": 0.3344, "step": 2566 }, { "epoch": 2.539070227497527, "grad_norm": 0.25857496371914634, "learning_rate": 8.537925980212532e-06, "loss": 0.3955, "step": 2567 }, { "epoch": 2.5400593471810087, "grad_norm": 0.24794143406441915, "learning_rate": 8.51960425064126e-06, "loss": 0.3279, "step": 2568 }, { "epoch": 2.5410484668644906, "grad_norm": 0.23454478908544515, "learning_rate": 8.501282521069988e-06, "loss": 0.3387, "step": 2569 }, { "epoch": 2.5420375865479725, "grad_norm": 0.218775146778518, "learning_rate": 8.482960791498718e-06, "loss": 0.3149, "step": 2570 }, { "epoch": 2.543026706231454, "grad_norm": 0.256130200586046, "learning_rate": 8.464639061927446e-06, "loss": 0.3491, "step": 2571 }, { "epoch": 2.5440158259149355, "grad_norm": 0.21696788649308552, "learning_rate": 8.446317332356175e-06, "loss": 0.3194, "step": 2572 }, { "epoch": 2.5450049455984174, "grad_norm": 0.214921895223905, "learning_rate": 8.427995602784904e-06, "loss": 0.3066, "step": 2573 }, { "epoch": 2.5459940652818993, "grad_norm": 0.222786093431417, "learning_rate": 8.409673873213632e-06, "loss": 0.3633, "step": 2574 }, { "epoch": 2.546983184965381, "grad_norm": 0.23510985038285132, "learning_rate": 8.39135214364236e-06, "loss": 0.3647, "step": 2575 }, { "epoch": 2.5479723046488623, "grad_norm": 0.2192600372412503, "learning_rate": 8.373030414071089e-06, "loss": 0.3552, "step": 2576 }, { "epoch": 2.548961424332344, "grad_norm": 0.21452070249266664, "learning_rate": 8.354708684499817e-06, "loss": 0.3031, "step": 2577 }, { "epoch": 2.549950544015826, "grad_norm": 0.2286175179188525, "learning_rate": 8.336386954928547e-06, "loss": 0.3338, "step": 2578 }, { "epoch": 2.5509396636993076, "grad_norm": 0.24433394067425576, "learning_rate": 8.318065225357275e-06, "loss": 0.3519, "step": 2579 }, { "epoch": 2.551928783382789, "grad_norm": 0.2188927821523692, "learning_rate": 8.299743495786003e-06, "loss": 0.2997, "step": 2580 }, { "epoch": 2.552917903066271, "grad_norm": 0.23903928008474262, "learning_rate": 8.281421766214731e-06, "loss": 0.3582, "step": 2581 }, { "epoch": 2.553907022749753, "grad_norm": 0.2252037417774991, "learning_rate": 8.263100036643459e-06, "loss": 0.3158, "step": 2582 }, { "epoch": 2.5548961424332344, "grad_norm": 0.22547563235668888, "learning_rate": 8.244778307072187e-06, "loss": 0.3161, "step": 2583 }, { "epoch": 2.555885262116716, "grad_norm": 0.212708651970397, "learning_rate": 8.226456577500917e-06, "loss": 0.3021, "step": 2584 }, { "epoch": 2.5568743818001978, "grad_norm": 0.2211913667645755, "learning_rate": 8.208134847929645e-06, "loss": 0.3355, "step": 2585 }, { "epoch": 2.5578635014836797, "grad_norm": 0.2422032730001762, "learning_rate": 8.189813118358373e-06, "loss": 0.3753, "step": 2586 }, { "epoch": 2.558852621167161, "grad_norm": 0.26729432792289365, "learning_rate": 8.171491388787101e-06, "loss": 0.399, "step": 2587 }, { "epoch": 2.559841740850643, "grad_norm": 0.25427480702090216, "learning_rate": 8.15316965921583e-06, "loss": 0.3876, "step": 2588 }, { "epoch": 2.5608308605341246, "grad_norm": 0.23376302569182988, "learning_rate": 8.134847929644557e-06, "loss": 0.3499, "step": 2589 }, { "epoch": 2.5618199802176065, "grad_norm": 0.21819970135465813, "learning_rate": 8.116526200073287e-06, "loss": 0.3337, "step": 2590 }, { "epoch": 2.562809099901088, "grad_norm": 0.249529986101218, "learning_rate": 8.098204470502015e-06, "loss": 0.358, "step": 2591 }, { "epoch": 2.56379821958457, "grad_norm": 0.2365993729449896, "learning_rate": 8.079882740930744e-06, "loss": 0.3484, "step": 2592 }, { "epoch": 2.5647873392680514, "grad_norm": 0.23483393877228428, "learning_rate": 8.061561011359473e-06, "loss": 0.3566, "step": 2593 }, { "epoch": 2.5657764589515333, "grad_norm": 0.222182938428324, "learning_rate": 8.043239281788201e-06, "loss": 0.3106, "step": 2594 }, { "epoch": 2.5667655786350148, "grad_norm": 0.2094235992786249, "learning_rate": 8.02491755221693e-06, "loss": 0.3308, "step": 2595 }, { "epoch": 2.5677546983184967, "grad_norm": 0.22678718769630118, "learning_rate": 8.00659582264566e-06, "loss": 0.3054, "step": 2596 }, { "epoch": 2.568743818001978, "grad_norm": 0.2309026367438794, "learning_rate": 7.988274093074387e-06, "loss": 0.3136, "step": 2597 }, { "epoch": 2.56973293768546, "grad_norm": 0.22667618009371604, "learning_rate": 7.969952363503116e-06, "loss": 0.349, "step": 2598 }, { "epoch": 2.5707220573689415, "grad_norm": 0.21117846126384685, "learning_rate": 7.951630633931844e-06, "loss": 0.327, "step": 2599 }, { "epoch": 2.5717111770524235, "grad_norm": 0.21092844530263816, "learning_rate": 7.933308904360572e-06, "loss": 0.2968, "step": 2600 }, { "epoch": 2.572700296735905, "grad_norm": 0.22503274613024515, "learning_rate": 7.9149871747893e-06, "loss": 0.348, "step": 2601 }, { "epoch": 2.573689416419387, "grad_norm": 0.22080704154411201, "learning_rate": 7.89666544521803e-06, "loss": 0.3173, "step": 2602 }, { "epoch": 2.5746785361028683, "grad_norm": 0.20732028945319075, "learning_rate": 7.878343715646758e-06, "loss": 0.3168, "step": 2603 }, { "epoch": 2.5756676557863503, "grad_norm": 0.2380503672520683, "learning_rate": 7.860021986075486e-06, "loss": 0.3353, "step": 2604 }, { "epoch": 2.5766567754698317, "grad_norm": 0.21944010685108145, "learning_rate": 7.841700256504214e-06, "loss": 0.3154, "step": 2605 }, { "epoch": 2.5776458951533137, "grad_norm": 0.5291657134084721, "learning_rate": 7.823378526932942e-06, "loss": 0.3486, "step": 2606 }, { "epoch": 2.578635014836795, "grad_norm": 0.2287926036524713, "learning_rate": 7.805056797361672e-06, "loss": 0.3367, "step": 2607 }, { "epoch": 2.579624134520277, "grad_norm": 0.24607517669943713, "learning_rate": 7.7867350677904e-06, "loss": 0.3822, "step": 2608 }, { "epoch": 2.5806132542037585, "grad_norm": 0.30753285728195096, "learning_rate": 7.768413338219128e-06, "loss": 0.331, "step": 2609 }, { "epoch": 2.5816023738872405, "grad_norm": 0.23264272610772255, "learning_rate": 7.750091608647856e-06, "loss": 0.3177, "step": 2610 }, { "epoch": 2.582591493570722, "grad_norm": 0.24746556520732035, "learning_rate": 7.731769879076584e-06, "loss": 0.3255, "step": 2611 }, { "epoch": 2.583580613254204, "grad_norm": 0.2586316776392973, "learning_rate": 7.713448149505313e-06, "loss": 0.3122, "step": 2612 }, { "epoch": 2.5845697329376853, "grad_norm": 0.22370719892578067, "learning_rate": 7.695126419934042e-06, "loss": 0.3386, "step": 2613 }, { "epoch": 2.5855588526211672, "grad_norm": 0.22463072855787217, "learning_rate": 7.67680469036277e-06, "loss": 0.3192, "step": 2614 }, { "epoch": 2.5865479723046487, "grad_norm": 0.23719441390072404, "learning_rate": 7.658482960791499e-06, "loss": 0.3573, "step": 2615 }, { "epoch": 2.5875370919881306, "grad_norm": 0.607168367699493, "learning_rate": 7.640161231220228e-06, "loss": 0.4113, "step": 2616 }, { "epoch": 2.588526211671612, "grad_norm": 0.21084308256123116, "learning_rate": 7.621839501648956e-06, "loss": 0.3162, "step": 2617 }, { "epoch": 2.589515331355094, "grad_norm": 0.22956176268509806, "learning_rate": 7.603517772077684e-06, "loss": 0.3383, "step": 2618 }, { "epoch": 2.5905044510385755, "grad_norm": 0.22913294281991195, "learning_rate": 7.5851960425064135e-06, "loss": 0.3425, "step": 2619 }, { "epoch": 2.5914935707220574, "grad_norm": 0.23260244421957282, "learning_rate": 7.566874312935142e-06, "loss": 0.3409, "step": 2620 }, { "epoch": 2.592482690405539, "grad_norm": 0.2436083162884712, "learning_rate": 7.54855258336387e-06, "loss": 0.3385, "step": 2621 }, { "epoch": 2.593471810089021, "grad_norm": 0.21262219531288126, "learning_rate": 7.530230853792599e-06, "loss": 0.3135, "step": 2622 }, { "epoch": 2.5944609297725023, "grad_norm": 0.2368450650064414, "learning_rate": 7.511909124221327e-06, "loss": 0.385, "step": 2623 }, { "epoch": 2.5954500494559842, "grad_norm": 0.215786394243701, "learning_rate": 7.493587394650055e-06, "loss": 0.3208, "step": 2624 }, { "epoch": 2.596439169139466, "grad_norm": 0.238337529875864, "learning_rate": 7.475265665078785e-06, "loss": 0.3352, "step": 2625 }, { "epoch": 2.5974282888229476, "grad_norm": 0.20979045932789106, "learning_rate": 7.456943935507513e-06, "loss": 0.3257, "step": 2626 }, { "epoch": 2.598417408506429, "grad_norm": 0.24130139437210824, "learning_rate": 7.438622205936241e-06, "loss": 0.3548, "step": 2627 }, { "epoch": 2.599406528189911, "grad_norm": 0.2258956518218783, "learning_rate": 7.420300476364969e-06, "loss": 0.3752, "step": 2628 }, { "epoch": 2.600395647873393, "grad_norm": 0.23656689766867856, "learning_rate": 7.401978746793697e-06, "loss": 0.3919, "step": 2629 }, { "epoch": 2.6013847675568744, "grad_norm": 0.20797130671298258, "learning_rate": 7.383657017222425e-06, "loss": 0.295, "step": 2630 }, { "epoch": 2.602373887240356, "grad_norm": 0.2123407700433041, "learning_rate": 7.365335287651155e-06, "loss": 0.3318, "step": 2631 }, { "epoch": 2.603363006923838, "grad_norm": 0.25028617103387574, "learning_rate": 7.347013558079883e-06, "loss": 0.3695, "step": 2632 }, { "epoch": 2.6043521266073197, "grad_norm": 0.23738737755032072, "learning_rate": 7.328691828508611e-06, "loss": 0.3469, "step": 2633 }, { "epoch": 2.605341246290801, "grad_norm": 0.230389214849109, "learning_rate": 7.31037009893734e-06, "loss": 0.3563, "step": 2634 }, { "epoch": 2.6063303659742827, "grad_norm": 0.21597950801778004, "learning_rate": 7.292048369366068e-06, "loss": 0.3339, "step": 2635 }, { "epoch": 2.6073194856577646, "grad_norm": 0.2349603015018135, "learning_rate": 7.2737266397947965e-06, "loss": 0.3586, "step": 2636 }, { "epoch": 2.6083086053412465, "grad_norm": 0.2083573369468888, "learning_rate": 7.255404910223526e-06, "loss": 0.3004, "step": 2637 }, { "epoch": 2.609297725024728, "grad_norm": 0.2375417304044579, "learning_rate": 7.2370831806522544e-06, "loss": 0.3667, "step": 2638 }, { "epoch": 2.6102868447082095, "grad_norm": 0.21283724617762306, "learning_rate": 7.2187614510809825e-06, "loss": 0.3242, "step": 2639 }, { "epoch": 2.6112759643916914, "grad_norm": 0.24402416859387327, "learning_rate": 7.200439721509711e-06, "loss": 0.3715, "step": 2640 }, { "epoch": 2.6122650840751733, "grad_norm": 0.22429238323329195, "learning_rate": 7.182117991938439e-06, "loss": 0.3266, "step": 2641 }, { "epoch": 2.613254203758655, "grad_norm": 0.21784385253259106, "learning_rate": 7.163796262367167e-06, "loss": 0.3087, "step": 2642 }, { "epoch": 2.6142433234421363, "grad_norm": 0.22124966929792098, "learning_rate": 7.145474532795897e-06, "loss": 0.3418, "step": 2643 }, { "epoch": 2.615232443125618, "grad_norm": 0.21365105225452002, "learning_rate": 7.127152803224625e-06, "loss": 0.3277, "step": 2644 }, { "epoch": 2.6162215628091, "grad_norm": 0.22073371217981072, "learning_rate": 7.108831073653354e-06, "loss": 0.3436, "step": 2645 }, { "epoch": 2.6172106824925816, "grad_norm": 0.23269085111573848, "learning_rate": 7.090509344082082e-06, "loss": 0.3352, "step": 2646 }, { "epoch": 2.618199802176063, "grad_norm": 0.20633980623132578, "learning_rate": 7.07218761451081e-06, "loss": 0.3013, "step": 2647 }, { "epoch": 2.619188921859545, "grad_norm": 0.2241331658702146, "learning_rate": 7.053865884939538e-06, "loss": 0.3542, "step": 2648 }, { "epoch": 2.620178041543027, "grad_norm": 0.2289694050769472, "learning_rate": 7.035544155368268e-06, "loss": 0.3261, "step": 2649 }, { "epoch": 2.6211671612265084, "grad_norm": 0.22213256212288926, "learning_rate": 7.017222425796996e-06, "loss": 0.3227, "step": 2650 }, { "epoch": 2.62215628090999, "grad_norm": 0.22863504343907765, "learning_rate": 6.998900696225724e-06, "loss": 0.3628, "step": 2651 }, { "epoch": 2.623145400593472, "grad_norm": 0.2166138187893422, "learning_rate": 6.980578966654452e-06, "loss": 0.3213, "step": 2652 }, { "epoch": 2.6241345202769537, "grad_norm": 0.22061018086704073, "learning_rate": 6.96225723708318e-06, "loss": 0.362, "step": 2653 }, { "epoch": 2.625123639960435, "grad_norm": 0.24304965680542823, "learning_rate": 6.943935507511909e-06, "loss": 0.3757, "step": 2654 }, { "epoch": 2.6261127596439167, "grad_norm": 0.2166210140761748, "learning_rate": 6.925613777940638e-06, "loss": 0.3167, "step": 2655 }, { "epoch": 2.6271018793273986, "grad_norm": 0.21031218581565844, "learning_rate": 6.907292048369366e-06, "loss": 0.3391, "step": 2656 }, { "epoch": 2.6280909990108805, "grad_norm": 0.20543467160833437, "learning_rate": 6.888970318798095e-06, "loss": 0.3058, "step": 2657 }, { "epoch": 2.629080118694362, "grad_norm": 0.23810595397235643, "learning_rate": 6.8706485892268234e-06, "loss": 0.3524, "step": 2658 }, { "epoch": 2.6300692383778435, "grad_norm": 0.21740713887368684, "learning_rate": 6.8523268596555516e-06, "loss": 0.3436, "step": 2659 }, { "epoch": 2.6310583580613254, "grad_norm": 0.20995936719262198, "learning_rate": 6.83400513008428e-06, "loss": 0.3251, "step": 2660 }, { "epoch": 2.6320474777448073, "grad_norm": 0.2303528115218695, "learning_rate": 6.8156834005130095e-06, "loss": 0.3675, "step": 2661 }, { "epoch": 2.6330365974282888, "grad_norm": 0.22458667063645485, "learning_rate": 6.797361670941738e-06, "loss": 0.3362, "step": 2662 }, { "epoch": 2.6340257171117702, "grad_norm": 0.20914465008907085, "learning_rate": 6.779039941370466e-06, "loss": 0.3254, "step": 2663 }, { "epoch": 2.635014836795252, "grad_norm": 0.23181001869723658, "learning_rate": 6.760718211799194e-06, "loss": 0.3387, "step": 2664 }, { "epoch": 2.636003956478734, "grad_norm": 0.2223165254541124, "learning_rate": 6.742396482227922e-06, "loss": 0.3498, "step": 2665 }, { "epoch": 2.6369930761622156, "grad_norm": 0.22334071001886052, "learning_rate": 6.724074752656651e-06, "loss": 0.3115, "step": 2666 }, { "epoch": 2.6379821958456975, "grad_norm": 0.23398152025847294, "learning_rate": 6.70575302308538e-06, "loss": 0.3507, "step": 2667 }, { "epoch": 2.638971315529179, "grad_norm": 0.22539172159114598, "learning_rate": 6.687431293514109e-06, "loss": 0.3611, "step": 2668 }, { "epoch": 2.639960435212661, "grad_norm": 0.21097633122850826, "learning_rate": 6.669109563942837e-06, "loss": 0.3183, "step": 2669 }, { "epoch": 2.6409495548961424, "grad_norm": 0.2445964426483477, "learning_rate": 6.650787834371565e-06, "loss": 0.3399, "step": 2670 }, { "epoch": 2.6419386745796243, "grad_norm": 0.23695631235504286, "learning_rate": 6.632466104800293e-06, "loss": 0.3412, "step": 2671 }, { "epoch": 2.6429277942631058, "grad_norm": 0.2534807066261069, "learning_rate": 6.614144375229021e-06, "loss": 0.3811, "step": 2672 }, { "epoch": 2.6439169139465877, "grad_norm": 0.2352492192731664, "learning_rate": 6.595822645657751e-06, "loss": 0.3265, "step": 2673 }, { "epoch": 2.644906033630069, "grad_norm": 0.22325401378241108, "learning_rate": 6.577500916086479e-06, "loss": 0.338, "step": 2674 }, { "epoch": 2.645895153313551, "grad_norm": 0.2355023381434876, "learning_rate": 6.559179186515207e-06, "loss": 0.3439, "step": 2675 }, { "epoch": 2.6468842729970326, "grad_norm": 0.25844539764836716, "learning_rate": 6.540857456943935e-06, "loss": 0.3465, "step": 2676 }, { "epoch": 2.6478733926805145, "grad_norm": 0.20204122613105294, "learning_rate": 6.522535727372664e-06, "loss": 0.326, "step": 2677 }, { "epoch": 2.648862512363996, "grad_norm": 0.20091765692132063, "learning_rate": 6.5042139978013925e-06, "loss": 0.3146, "step": 2678 }, { "epoch": 2.649851632047478, "grad_norm": 0.2151974965408436, "learning_rate": 6.485892268230121e-06, "loss": 0.3508, "step": 2679 }, { "epoch": 2.6508407517309593, "grad_norm": 0.25164157106976864, "learning_rate": 6.46757053865885e-06, "loss": 0.3793, "step": 2680 }, { "epoch": 2.6518298714144413, "grad_norm": 0.23370693030036446, "learning_rate": 6.4492488090875785e-06, "loss": 0.4006, "step": 2681 }, { "epoch": 2.6528189910979227, "grad_norm": 0.23987905015984629, "learning_rate": 6.430927079516307e-06, "loss": 0.3613, "step": 2682 }, { "epoch": 2.6538081107814047, "grad_norm": 0.21594181062915788, "learning_rate": 6.412605349945035e-06, "loss": 0.3307, "step": 2683 }, { "epoch": 2.654797230464886, "grad_norm": 0.2290158725540692, "learning_rate": 6.394283620373763e-06, "loss": 0.3062, "step": 2684 }, { "epoch": 2.655786350148368, "grad_norm": 0.22948262260424798, "learning_rate": 6.375961890802493e-06, "loss": 0.3722, "step": 2685 }, { "epoch": 2.6567754698318495, "grad_norm": 0.22027398416939878, "learning_rate": 6.357640161231221e-06, "loss": 0.3253, "step": 2686 }, { "epoch": 2.6577645895153315, "grad_norm": 0.22047131128891237, "learning_rate": 6.339318431659949e-06, "loss": 0.3447, "step": 2687 }, { "epoch": 2.658753709198813, "grad_norm": 0.21264120578875886, "learning_rate": 6.320996702088677e-06, "loss": 0.3178, "step": 2688 }, { "epoch": 2.659742828882295, "grad_norm": 0.21403466908930086, "learning_rate": 6.302674972517406e-06, "loss": 0.3263, "step": 2689 }, { "epoch": 2.6607319485657763, "grad_norm": 0.2340128370458701, "learning_rate": 6.284353242946134e-06, "loss": 0.344, "step": 2690 }, { "epoch": 2.6617210682492582, "grad_norm": 0.21709601415825422, "learning_rate": 6.266031513374864e-06, "loss": 0.332, "step": 2691 }, { "epoch": 2.6627101879327397, "grad_norm": 0.20573493768846976, "learning_rate": 6.247709783803592e-06, "loss": 0.2896, "step": 2692 }, { "epoch": 2.6636993076162216, "grad_norm": 0.2175633594597353, "learning_rate": 6.22938805423232e-06, "loss": 0.3255, "step": 2693 }, { "epoch": 2.664688427299703, "grad_norm": 0.2208890382225565, "learning_rate": 6.211066324661048e-06, "loss": 0.3241, "step": 2694 }, { "epoch": 2.665677546983185, "grad_norm": 0.22648814414163115, "learning_rate": 6.192744595089777e-06, "loss": 0.3803, "step": 2695 }, { "epoch": 2.6666666666666665, "grad_norm": 0.2065203676472046, "learning_rate": 6.174422865518505e-06, "loss": 0.3179, "step": 2696 }, { "epoch": 2.6676557863501484, "grad_norm": 0.22119342098303224, "learning_rate": 6.156101135947233e-06, "loss": 0.3375, "step": 2697 }, { "epoch": 2.66864490603363, "grad_norm": 0.22093559612372538, "learning_rate": 6.137779406375962e-06, "loss": 0.3514, "step": 2698 }, { "epoch": 2.669634025717112, "grad_norm": 0.4155710402624548, "learning_rate": 6.1194576768046904e-06, "loss": 0.4384, "step": 2699 }, { "epoch": 2.6706231454005933, "grad_norm": 0.21197542896961055, "learning_rate": 6.101135947233419e-06, "loss": 0.3247, "step": 2700 }, { "epoch": 2.6716122650840752, "grad_norm": 0.20867372426418582, "learning_rate": 6.0828142176621475e-06, "loss": 0.3162, "step": 2701 }, { "epoch": 2.6726013847675567, "grad_norm": 0.21283459025003898, "learning_rate": 6.0644924880908765e-06, "loss": 0.3088, "step": 2702 }, { "epoch": 2.6735905044510386, "grad_norm": 0.27512316233142525, "learning_rate": 6.0461707585196046e-06, "loss": 0.3543, "step": 2703 }, { "epoch": 2.6745796241345206, "grad_norm": 0.23578251994420446, "learning_rate": 6.0278490289483335e-06, "loss": 0.3408, "step": 2704 }, { "epoch": 2.675568743818002, "grad_norm": 0.23108104721689834, "learning_rate": 6.009527299377062e-06, "loss": 0.3547, "step": 2705 }, { "epoch": 2.6765578635014835, "grad_norm": 0.22990919690448694, "learning_rate": 5.99120556980579e-06, "loss": 0.3583, "step": 2706 }, { "epoch": 2.6775469831849654, "grad_norm": 0.22608119799904844, "learning_rate": 5.972883840234519e-06, "loss": 0.3468, "step": 2707 }, { "epoch": 2.6785361028684473, "grad_norm": 0.23420424171951532, "learning_rate": 5.954562110663247e-06, "loss": 0.3711, "step": 2708 }, { "epoch": 2.679525222551929, "grad_norm": 0.22283499002319432, "learning_rate": 5.936240381091975e-06, "loss": 0.3433, "step": 2709 }, { "epoch": 2.6805143422354103, "grad_norm": 0.25165983826778154, "learning_rate": 5.917918651520704e-06, "loss": 0.3577, "step": 2710 }, { "epoch": 2.681503461918892, "grad_norm": 0.21381831548899768, "learning_rate": 5.899596921949432e-06, "loss": 0.3243, "step": 2711 }, { "epoch": 2.682492581602374, "grad_norm": 0.24859770910561865, "learning_rate": 5.881275192378161e-06, "loss": 0.3932, "step": 2712 }, { "epoch": 2.6834817012858556, "grad_norm": 0.22835997974466907, "learning_rate": 5.86295346280689e-06, "loss": 0.358, "step": 2713 }, { "epoch": 2.684470820969337, "grad_norm": 0.21603309757069702, "learning_rate": 5.844631733235618e-06, "loss": 0.3227, "step": 2714 }, { "epoch": 2.685459940652819, "grad_norm": 0.22420474785394226, "learning_rate": 5.826310003664346e-06, "loss": 0.3219, "step": 2715 }, { "epoch": 2.686449060336301, "grad_norm": 0.211435200550928, "learning_rate": 5.807988274093075e-06, "loss": 0.3027, "step": 2716 }, { "epoch": 2.6874381800197824, "grad_norm": 0.2244513347827355, "learning_rate": 5.789666544521803e-06, "loss": 0.3397, "step": 2717 }, { "epoch": 2.688427299703264, "grad_norm": 0.21717002471268115, "learning_rate": 5.771344814950531e-06, "loss": 0.3334, "step": 2718 }, { "epoch": 2.689416419386746, "grad_norm": 0.23016598747660916, "learning_rate": 5.75302308537926e-06, "loss": 0.3375, "step": 2719 }, { "epoch": 2.6904055390702277, "grad_norm": 0.21015267166960389, "learning_rate": 5.734701355807988e-06, "loss": 0.3159, "step": 2720 }, { "epoch": 2.691394658753709, "grad_norm": 0.2108694801384789, "learning_rate": 5.7163796262367165e-06, "loss": 0.3308, "step": 2721 }, { "epoch": 2.6923837784371907, "grad_norm": 0.23368408762172113, "learning_rate": 5.6980578966654455e-06, "loss": 0.3768, "step": 2722 }, { "epoch": 2.6933728981206726, "grad_norm": 0.23439430393969224, "learning_rate": 5.679736167094174e-06, "loss": 0.3017, "step": 2723 }, { "epoch": 2.6943620178041545, "grad_norm": 0.23942635675606497, "learning_rate": 5.6614144375229025e-06, "loss": 0.3689, "step": 2724 }, { "epoch": 2.695351137487636, "grad_norm": 0.22177373379695609, "learning_rate": 5.6430927079516315e-06, "loss": 0.3521, "step": 2725 }, { "epoch": 2.6963402571711175, "grad_norm": 0.21547233338756122, "learning_rate": 5.62477097838036e-06, "loss": 0.3416, "step": 2726 }, { "epoch": 2.6973293768545994, "grad_norm": 0.21774106460684997, "learning_rate": 5.606449248809088e-06, "loss": 0.3218, "step": 2727 }, { "epoch": 2.6983184965380813, "grad_norm": 0.2334321077543478, "learning_rate": 5.588127519237817e-06, "loss": 0.358, "step": 2728 }, { "epoch": 2.699307616221563, "grad_norm": 0.24139566192301815, "learning_rate": 5.569805789666545e-06, "loss": 0.3771, "step": 2729 }, { "epoch": 2.7002967359050443, "grad_norm": 0.2540431339935207, "learning_rate": 5.551484060095273e-06, "loss": 0.3657, "step": 2730 }, { "epoch": 2.701285855588526, "grad_norm": 0.5909800839666434, "learning_rate": 5.533162330524002e-06, "loss": 0.3952, "step": 2731 }, { "epoch": 2.702274975272008, "grad_norm": 0.2503607288953179, "learning_rate": 5.51484060095273e-06, "loss": 0.3793, "step": 2732 }, { "epoch": 2.7032640949554896, "grad_norm": 0.2223796649689281, "learning_rate": 5.496518871381458e-06, "loss": 0.3031, "step": 2733 }, { "epoch": 2.704253214638971, "grad_norm": 0.23538524556735074, "learning_rate": 5.478197141810187e-06, "loss": 0.3706, "step": 2734 }, { "epoch": 2.705242334322453, "grad_norm": 0.2229912934237323, "learning_rate": 5.459875412238916e-06, "loss": 0.3253, "step": 2735 }, { "epoch": 2.706231454005935, "grad_norm": 0.22679198948267829, "learning_rate": 5.441553682667644e-06, "loss": 0.3297, "step": 2736 }, { "epoch": 2.7072205736894164, "grad_norm": 0.21590619745768205, "learning_rate": 5.423231953096373e-06, "loss": 0.3169, "step": 2737 }, { "epoch": 2.708209693372898, "grad_norm": 0.2217628704155836, "learning_rate": 5.404910223525101e-06, "loss": 0.3624, "step": 2738 }, { "epoch": 2.7091988130563798, "grad_norm": 0.2169003913914285, "learning_rate": 5.386588493953829e-06, "loss": 0.3112, "step": 2739 }, { "epoch": 2.7101879327398617, "grad_norm": 0.23975530450787874, "learning_rate": 5.368266764382558e-06, "loss": 0.371, "step": 2740 }, { "epoch": 2.711177052423343, "grad_norm": 0.22760491329000107, "learning_rate": 5.349945034811286e-06, "loss": 0.3033, "step": 2741 }, { "epoch": 2.712166172106825, "grad_norm": 0.2126172357592932, "learning_rate": 5.3316233052400145e-06, "loss": 0.3459, "step": 2742 }, { "epoch": 2.7131552917903066, "grad_norm": 0.24447754746061037, "learning_rate": 5.3133015756687434e-06, "loss": 0.3579, "step": 2743 }, { "epoch": 2.7141444114737885, "grad_norm": 0.21923248450136007, "learning_rate": 5.2949798460974715e-06, "loss": 0.3553, "step": 2744 }, { "epoch": 2.71513353115727, "grad_norm": 0.23324322920894322, "learning_rate": 5.2766581165262005e-06, "loss": 0.3876, "step": 2745 }, { "epoch": 2.716122650840752, "grad_norm": 0.21651127289013336, "learning_rate": 5.258336386954929e-06, "loss": 0.3271, "step": 2746 }, { "epoch": 2.7171117705242334, "grad_norm": 0.2278600935466184, "learning_rate": 5.2400146573836576e-06, "loss": 0.3767, "step": 2747 }, { "epoch": 2.7181008902077153, "grad_norm": 0.2102279454382535, "learning_rate": 5.221692927812386e-06, "loss": 0.2996, "step": 2748 }, { "epoch": 2.7190900098911968, "grad_norm": 0.19846329542181648, "learning_rate": 5.203371198241115e-06, "loss": 0.2969, "step": 2749 }, { "epoch": 2.7200791295746787, "grad_norm": 0.23474124954742392, "learning_rate": 5.185049468669843e-06, "loss": 0.3563, "step": 2750 }, { "epoch": 2.72106824925816, "grad_norm": 0.2133239490347135, "learning_rate": 5.166727739098571e-06, "loss": 0.3625, "step": 2751 }, { "epoch": 2.722057368941642, "grad_norm": 0.2358371050129765, "learning_rate": 5.1484060095273e-06, "loss": 0.3468, "step": 2752 }, { "epoch": 2.7230464886251236, "grad_norm": 0.20255887262973588, "learning_rate": 5.130084279956028e-06, "loss": 0.3045, "step": 2753 }, { "epoch": 2.7240356083086055, "grad_norm": 0.22868689020977842, "learning_rate": 5.111762550384756e-06, "loss": 0.3525, "step": 2754 }, { "epoch": 2.725024727992087, "grad_norm": 0.24113962478912296, "learning_rate": 5.093440820813485e-06, "loss": 0.3564, "step": 2755 }, { "epoch": 2.726013847675569, "grad_norm": 0.2442213322729859, "learning_rate": 5.075119091242213e-06, "loss": 0.3272, "step": 2756 }, { "epoch": 2.7270029673590503, "grad_norm": 0.22564109275269484, "learning_rate": 5.056797361670942e-06, "loss": 0.3294, "step": 2757 }, { "epoch": 2.7279920870425323, "grad_norm": 0.2187439459162259, "learning_rate": 5.038475632099671e-06, "loss": 0.3428, "step": 2758 }, { "epoch": 2.7289812067260137, "grad_norm": 0.22198247455708378, "learning_rate": 5.020153902528399e-06, "loss": 0.3523, "step": 2759 }, { "epoch": 2.7299703264094957, "grad_norm": 0.20648216241698295, "learning_rate": 5.001832172957127e-06, "loss": 0.2998, "step": 2760 }, { "epoch": 2.730959446092977, "grad_norm": 0.21591111811349886, "learning_rate": 4.983510443385856e-06, "loss": 0.3502, "step": 2761 }, { "epoch": 2.731948565776459, "grad_norm": 0.20875229932327685, "learning_rate": 4.965188713814584e-06, "loss": 0.319, "step": 2762 }, { "epoch": 2.7329376854599405, "grad_norm": 0.2309618563362465, "learning_rate": 4.9468669842433124e-06, "loss": 0.3301, "step": 2763 }, { "epoch": 2.7339268051434225, "grad_norm": 0.21705046113121196, "learning_rate": 4.928545254672041e-06, "loss": 0.3328, "step": 2764 }, { "epoch": 2.734915924826904, "grad_norm": 0.2027787922422683, "learning_rate": 4.9102235251007695e-06, "loss": 0.3022, "step": 2765 }, { "epoch": 2.735905044510386, "grad_norm": 0.2133690055551515, "learning_rate": 4.891901795529498e-06, "loss": 0.3357, "step": 2766 }, { "epoch": 2.7368941641938673, "grad_norm": 0.22559371081701649, "learning_rate": 4.873580065958227e-06, "loss": 0.3545, "step": 2767 }, { "epoch": 2.7378832838773492, "grad_norm": 0.21203557203334927, "learning_rate": 4.8552583363869555e-06, "loss": 0.3328, "step": 2768 }, { "epoch": 2.7388724035608307, "grad_norm": 0.21294747265250044, "learning_rate": 4.836936606815684e-06, "loss": 0.332, "step": 2769 }, { "epoch": 2.7398615232443126, "grad_norm": 0.21123677919400954, "learning_rate": 4.818614877244413e-06, "loss": 0.3414, "step": 2770 }, { "epoch": 2.740850642927794, "grad_norm": 0.21659310977778334, "learning_rate": 4.800293147673141e-06, "loss": 0.3264, "step": 2771 }, { "epoch": 2.741839762611276, "grad_norm": 0.22960667507714003, "learning_rate": 4.781971418101869e-06, "loss": 0.374, "step": 2772 }, { "epoch": 2.7428288822947575, "grad_norm": 0.21339735395164217, "learning_rate": 4.763649688530598e-06, "loss": 0.3446, "step": 2773 }, { "epoch": 2.7438180019782394, "grad_norm": 0.23992675800975746, "learning_rate": 4.745327958959326e-06, "loss": 0.3627, "step": 2774 }, { "epoch": 2.744807121661721, "grad_norm": 0.21653441504113724, "learning_rate": 4.727006229388054e-06, "loss": 0.3292, "step": 2775 }, { "epoch": 2.745796241345203, "grad_norm": 0.20765781658155638, "learning_rate": 4.708684499816783e-06, "loss": 0.3369, "step": 2776 }, { "epoch": 2.7467853610286843, "grad_norm": 0.2137923966682356, "learning_rate": 4.690362770245511e-06, "loss": 0.3316, "step": 2777 }, { "epoch": 2.7477744807121662, "grad_norm": 0.22951511263981078, "learning_rate": 4.672041040674239e-06, "loss": 0.3232, "step": 2778 }, { "epoch": 2.7487636003956477, "grad_norm": 0.21668420363059104, "learning_rate": 4.653719311102968e-06, "loss": 0.3133, "step": 2779 }, { "epoch": 2.7497527200791296, "grad_norm": 0.20870298094145542, "learning_rate": 4.635397581531697e-06, "loss": 0.3079, "step": 2780 }, { "epoch": 2.750741839762611, "grad_norm": 0.21695592576488162, "learning_rate": 4.617075851960425e-06, "loss": 0.3365, "step": 2781 }, { "epoch": 2.751730959446093, "grad_norm": 0.23452800703138157, "learning_rate": 4.598754122389154e-06, "loss": 0.3605, "step": 2782 }, { "epoch": 2.752720079129575, "grad_norm": 0.22386479424333205, "learning_rate": 4.580432392817882e-06, "loss": 0.3766, "step": 2783 }, { "epoch": 2.7537091988130564, "grad_norm": 0.2500059904791562, "learning_rate": 4.56211066324661e-06, "loss": 0.3552, "step": 2784 }, { "epoch": 2.754698318496538, "grad_norm": 0.2120523623817133, "learning_rate": 4.543788933675339e-06, "loss": 0.3178, "step": 2785 }, { "epoch": 2.75568743818002, "grad_norm": 0.2151292399138165, "learning_rate": 4.5254672041040675e-06, "loss": 0.3671, "step": 2786 }, { "epoch": 2.7566765578635017, "grad_norm": 0.21872770644925504, "learning_rate": 4.507145474532796e-06, "loss": 0.3641, "step": 2787 }, { "epoch": 2.757665677546983, "grad_norm": 0.24772664908497757, "learning_rate": 4.4888237449615246e-06, "loss": 0.3734, "step": 2788 }, { "epoch": 2.7586547972304647, "grad_norm": 0.2642668933133229, "learning_rate": 4.470502015390253e-06, "loss": 0.3684, "step": 2789 }, { "epoch": 2.7596439169139466, "grad_norm": 0.2221777432007733, "learning_rate": 4.452180285818982e-06, "loss": 0.349, "step": 2790 }, { "epoch": 2.7606330365974285, "grad_norm": 0.2196073440373851, "learning_rate": 4.43385855624771e-06, "loss": 0.3489, "step": 2791 }, { "epoch": 2.76162215628091, "grad_norm": 0.23391467333346672, "learning_rate": 4.415536826676439e-06, "loss": 0.3598, "step": 2792 }, { "epoch": 2.7626112759643915, "grad_norm": 0.2115533472124657, "learning_rate": 4.397215097105167e-06, "loss": 0.3081, "step": 2793 }, { "epoch": 2.7636003956478734, "grad_norm": 0.2271678820385135, "learning_rate": 4.378893367533896e-06, "loss": 0.3716, "step": 2794 }, { "epoch": 2.7645895153313553, "grad_norm": 0.23767597252465578, "learning_rate": 4.360571637962624e-06, "loss": 0.3591, "step": 2795 }, { "epoch": 2.765578635014837, "grad_norm": 0.2066536302237579, "learning_rate": 4.342249908391352e-06, "loss": 0.3413, "step": 2796 }, { "epoch": 2.7665677546983183, "grad_norm": 0.21845630072729733, "learning_rate": 4.323928178820081e-06, "loss": 0.3577, "step": 2797 }, { "epoch": 2.7675568743818, "grad_norm": 0.21779485602289012, "learning_rate": 4.305606449248809e-06, "loss": 0.3468, "step": 2798 }, { "epoch": 2.768545994065282, "grad_norm": 0.22232401786465894, "learning_rate": 4.287284719677537e-06, "loss": 0.3144, "step": 2799 }, { "epoch": 2.7695351137487636, "grad_norm": 0.23008435815779882, "learning_rate": 4.268962990106266e-06, "loss": 0.3433, "step": 2800 }, { "epoch": 2.770524233432245, "grad_norm": 0.2226756211496331, "learning_rate": 4.250641260534994e-06, "loss": 0.375, "step": 2801 }, { "epoch": 2.771513353115727, "grad_norm": 0.21695887700850297, "learning_rate": 4.232319530963723e-06, "loss": 0.3465, "step": 2802 }, { "epoch": 2.772502472799209, "grad_norm": 0.22223208742172174, "learning_rate": 4.213997801392452e-06, "loss": 0.2978, "step": 2803 }, { "epoch": 2.7734915924826904, "grad_norm": 0.2170797804111916, "learning_rate": 4.19567607182118e-06, "loss": 0.3213, "step": 2804 }, { "epoch": 2.774480712166172, "grad_norm": 0.22027515254777197, "learning_rate": 4.177354342249908e-06, "loss": 0.3443, "step": 2805 }, { "epoch": 2.775469831849654, "grad_norm": 0.22207428166028975, "learning_rate": 4.159032612678637e-06, "loss": 0.3573, "step": 2806 }, { "epoch": 2.7764589515331357, "grad_norm": 0.2087620444374386, "learning_rate": 4.1407108831073654e-06, "loss": 0.3414, "step": 2807 }, { "epoch": 2.777448071216617, "grad_norm": 0.2078913845907797, "learning_rate": 4.1223891535360936e-06, "loss": 0.3213, "step": 2808 }, { "epoch": 2.7784371909000987, "grad_norm": 0.23077801939274703, "learning_rate": 4.1040674239648225e-06, "loss": 0.3524, "step": 2809 }, { "epoch": 2.7794263105835806, "grad_norm": 0.21731853730142003, "learning_rate": 4.085745694393551e-06, "loss": 0.3441, "step": 2810 }, { "epoch": 2.7804154302670625, "grad_norm": 0.216573556930499, "learning_rate": 4.067423964822279e-06, "loss": 0.3576, "step": 2811 }, { "epoch": 2.781404549950544, "grad_norm": 0.21120665151959334, "learning_rate": 4.049102235251008e-06, "loss": 0.3431, "step": 2812 }, { "epoch": 2.7823936696340255, "grad_norm": 0.23307858510646606, "learning_rate": 4.030780505679737e-06, "loss": 0.3603, "step": 2813 }, { "epoch": 2.7833827893175074, "grad_norm": 0.2107086413941475, "learning_rate": 4.012458776108465e-06, "loss": 0.3332, "step": 2814 }, { "epoch": 2.7843719090009893, "grad_norm": 0.23230553012383037, "learning_rate": 3.994137046537194e-06, "loss": 0.3313, "step": 2815 }, { "epoch": 2.7853610286844708, "grad_norm": 0.25073644328468253, "learning_rate": 3.975815316965922e-06, "loss": 0.4522, "step": 2816 }, { "epoch": 2.7863501483679523, "grad_norm": 0.22850963515675785, "learning_rate": 3.95749358739465e-06, "loss": 0.3318, "step": 2817 }, { "epoch": 2.787339268051434, "grad_norm": 0.22594036641456755, "learning_rate": 3.939171857823379e-06, "loss": 0.3078, "step": 2818 }, { "epoch": 2.788328387734916, "grad_norm": 0.23726950060282387, "learning_rate": 3.920850128252107e-06, "loss": 0.3667, "step": 2819 }, { "epoch": 2.7893175074183976, "grad_norm": 0.21765453645873728, "learning_rate": 3.902528398680836e-06, "loss": 0.3253, "step": 2820 }, { "epoch": 2.7903066271018795, "grad_norm": 0.21731756160867208, "learning_rate": 3.884206669109564e-06, "loss": 0.3354, "step": 2821 }, { "epoch": 2.791295746785361, "grad_norm": 0.20040051432164183, "learning_rate": 3.865884939538292e-06, "loss": 0.3006, "step": 2822 }, { "epoch": 2.792284866468843, "grad_norm": 0.22186912403842526, "learning_rate": 3.847563209967021e-06, "loss": 0.3171, "step": 2823 }, { "epoch": 2.7932739861523244, "grad_norm": 0.22337977682594518, "learning_rate": 3.829241480395749e-06, "loss": 0.3805, "step": 2824 }, { "epoch": 2.7942631058358063, "grad_norm": 0.22332412301847868, "learning_rate": 3.810919750824478e-06, "loss": 0.3363, "step": 2825 }, { "epoch": 2.7952522255192878, "grad_norm": 0.2184887090680327, "learning_rate": 3.7925980212532068e-06, "loss": 0.3253, "step": 2826 }, { "epoch": 2.7962413452027697, "grad_norm": 0.2326535028536087, "learning_rate": 3.774276291681935e-06, "loss": 0.3413, "step": 2827 }, { "epoch": 2.797230464886251, "grad_norm": 0.22537246867937816, "learning_rate": 3.7559545621106634e-06, "loss": 0.3787, "step": 2828 }, { "epoch": 2.798219584569733, "grad_norm": 0.21512167988752373, "learning_rate": 3.7376328325393924e-06, "loss": 0.3419, "step": 2829 }, { "epoch": 2.7992087042532146, "grad_norm": 0.21070682115918318, "learning_rate": 3.7193111029681205e-06, "loss": 0.3435, "step": 2830 }, { "epoch": 2.8001978239366965, "grad_norm": 0.23029319903291864, "learning_rate": 3.7009893733968486e-06, "loss": 0.3486, "step": 2831 }, { "epoch": 2.801186943620178, "grad_norm": 0.2138444420546789, "learning_rate": 3.6826676438255776e-06, "loss": 0.3415, "step": 2832 }, { "epoch": 2.80217606330366, "grad_norm": 0.20357209160670728, "learning_rate": 3.6643459142543057e-06, "loss": 0.3293, "step": 2833 }, { "epoch": 2.8031651829871413, "grad_norm": 0.25026208901493, "learning_rate": 3.646024184683034e-06, "loss": 0.3354, "step": 2834 }, { "epoch": 2.8041543026706233, "grad_norm": 0.2011884823421506, "learning_rate": 3.627702455111763e-06, "loss": 0.3096, "step": 2835 }, { "epoch": 2.8051434223541047, "grad_norm": 0.2394174392476079, "learning_rate": 3.6093807255404913e-06, "loss": 0.37, "step": 2836 }, { "epoch": 2.8061325420375867, "grad_norm": 0.2255311976493184, "learning_rate": 3.5910589959692194e-06, "loss": 0.3478, "step": 2837 }, { "epoch": 2.807121661721068, "grad_norm": 0.24237047406156542, "learning_rate": 3.5727372663979483e-06, "loss": 0.3575, "step": 2838 }, { "epoch": 2.80811078140455, "grad_norm": 0.21059337790432422, "learning_rate": 3.554415536826677e-06, "loss": 0.3284, "step": 2839 }, { "epoch": 2.8090999010880315, "grad_norm": 0.20503847170160883, "learning_rate": 3.536093807255405e-06, "loss": 0.3233, "step": 2840 }, { "epoch": 2.8100890207715135, "grad_norm": 0.21119961563953477, "learning_rate": 3.517772077684134e-06, "loss": 0.3272, "step": 2841 }, { "epoch": 2.811078140454995, "grad_norm": 0.19978901112842018, "learning_rate": 3.499450348112862e-06, "loss": 0.3026, "step": 2842 }, { "epoch": 2.812067260138477, "grad_norm": 0.21915361223504046, "learning_rate": 3.48112861854159e-06, "loss": 0.3428, "step": 2843 }, { "epoch": 2.8130563798219583, "grad_norm": 0.2235259457896506, "learning_rate": 3.462806888970319e-06, "loss": 0.3043, "step": 2844 }, { "epoch": 2.8140454995054403, "grad_norm": 0.2178345130349941, "learning_rate": 3.4444851593990477e-06, "loss": 0.3428, "step": 2845 }, { "epoch": 2.8150346191889217, "grad_norm": 0.21372597462198137, "learning_rate": 3.4261634298277758e-06, "loss": 0.2926, "step": 2846 }, { "epoch": 2.8160237388724036, "grad_norm": 0.22720651208550413, "learning_rate": 3.4078417002565047e-06, "loss": 0.3482, "step": 2847 }, { "epoch": 2.817012858555885, "grad_norm": 0.22006740899272062, "learning_rate": 3.389519970685233e-06, "loss": 0.3427, "step": 2848 }, { "epoch": 2.818001978239367, "grad_norm": 0.2159529003979577, "learning_rate": 3.371198241113961e-06, "loss": 0.3423, "step": 2849 }, { "epoch": 2.8189910979228485, "grad_norm": 0.20214518327063988, "learning_rate": 3.35287651154269e-06, "loss": 0.3143, "step": 2850 }, { "epoch": 2.8199802176063304, "grad_norm": 0.21793790452801143, "learning_rate": 3.3345547819714185e-06, "loss": 0.3113, "step": 2851 }, { "epoch": 2.820969337289812, "grad_norm": 0.20335863728522915, "learning_rate": 3.3162330524001466e-06, "loss": 0.3059, "step": 2852 }, { "epoch": 2.821958456973294, "grad_norm": 0.20270595996542862, "learning_rate": 3.2979113228288755e-06, "loss": 0.3272, "step": 2853 }, { "epoch": 2.8229475766567753, "grad_norm": 1.1077539805296455, "learning_rate": 3.2795895932576036e-06, "loss": 0.3856, "step": 2854 }, { "epoch": 2.8239366963402572, "grad_norm": 0.24132309930564733, "learning_rate": 3.261267863686332e-06, "loss": 0.3957, "step": 2855 }, { "epoch": 2.8249258160237387, "grad_norm": 0.2275466649899499, "learning_rate": 3.2429461341150607e-06, "loss": 0.3769, "step": 2856 }, { "epoch": 2.8259149357072206, "grad_norm": 0.25008670282611023, "learning_rate": 3.2246244045437892e-06, "loss": 0.35, "step": 2857 }, { "epoch": 2.826904055390702, "grad_norm": 0.22773541395218408, "learning_rate": 3.2063026749725174e-06, "loss": 0.3784, "step": 2858 }, { "epoch": 2.827893175074184, "grad_norm": 0.20609422624498253, "learning_rate": 3.1879809454012463e-06, "loss": 0.3239, "step": 2859 }, { "epoch": 2.8288822947576655, "grad_norm": 0.20460207000166866, "learning_rate": 3.1696592158299744e-06, "loss": 0.3215, "step": 2860 }, { "epoch": 2.8298714144411474, "grad_norm": 0.23129338204484665, "learning_rate": 3.151337486258703e-06, "loss": 0.3938, "step": 2861 }, { "epoch": 2.8308605341246293, "grad_norm": 0.22362717643248406, "learning_rate": 3.133015756687432e-06, "loss": 0.3617, "step": 2862 }, { "epoch": 2.831849653808111, "grad_norm": 0.21117103548385116, "learning_rate": 3.11469402711616e-06, "loss": 0.3237, "step": 2863 }, { "epoch": 2.8328387734915923, "grad_norm": 0.20671124421870285, "learning_rate": 3.0963722975448886e-06, "loss": 0.3112, "step": 2864 }, { "epoch": 2.833827893175074, "grad_norm": 0.22217972694620672, "learning_rate": 3.0780505679736167e-06, "loss": 0.3634, "step": 2865 }, { "epoch": 2.834817012858556, "grad_norm": 0.2123227501555754, "learning_rate": 3.0597288384023452e-06, "loss": 0.3181, "step": 2866 }, { "epoch": 2.8358061325420376, "grad_norm": 0.20096780688164737, "learning_rate": 3.0414071088310737e-06, "loss": 0.3491, "step": 2867 }, { "epoch": 2.836795252225519, "grad_norm": 0.21924673434018688, "learning_rate": 3.0230853792598023e-06, "loss": 0.3443, "step": 2868 }, { "epoch": 2.837784371909001, "grad_norm": 0.2286827845733992, "learning_rate": 3.004763649688531e-06, "loss": 0.3668, "step": 2869 }, { "epoch": 2.838773491592483, "grad_norm": 0.21092705025428599, "learning_rate": 2.9864419201172594e-06, "loss": 0.338, "step": 2870 }, { "epoch": 2.8397626112759644, "grad_norm": 0.2148124154519086, "learning_rate": 2.9681201905459875e-06, "loss": 0.3545, "step": 2871 }, { "epoch": 2.840751730959446, "grad_norm": 0.2133227511589604, "learning_rate": 2.949798460974716e-06, "loss": 0.3347, "step": 2872 }, { "epoch": 2.841740850642928, "grad_norm": 0.2208561637151365, "learning_rate": 2.931476731403445e-06, "loss": 0.3511, "step": 2873 }, { "epoch": 2.8427299703264097, "grad_norm": 0.2211175652037006, "learning_rate": 2.913155001832173e-06, "loss": 0.3802, "step": 2874 }, { "epoch": 2.843719090009891, "grad_norm": 0.22087087502836644, "learning_rate": 2.8948332722609016e-06, "loss": 0.3668, "step": 2875 }, { "epoch": 2.8447082096933727, "grad_norm": 0.22955939394628602, "learning_rate": 2.87651154268963e-06, "loss": 0.3035, "step": 2876 }, { "epoch": 2.8456973293768546, "grad_norm": 0.20911592349861238, "learning_rate": 2.8581898131183583e-06, "loss": 0.3487, "step": 2877 }, { "epoch": 2.8466864490603365, "grad_norm": 0.20740229129970458, "learning_rate": 2.839868083547087e-06, "loss": 0.2983, "step": 2878 }, { "epoch": 2.847675568743818, "grad_norm": 0.1982895805702466, "learning_rate": 2.8215463539758157e-06, "loss": 0.29, "step": 2879 }, { "epoch": 2.8486646884272995, "grad_norm": 0.21506799683461428, "learning_rate": 2.803224624404544e-06, "loss": 0.3596, "step": 2880 }, { "epoch": 2.8496538081107814, "grad_norm": 0.21618734491043085, "learning_rate": 2.7849028948332724e-06, "loss": 0.3111, "step": 2881 }, { "epoch": 2.8506429277942633, "grad_norm": 0.2123248042207484, "learning_rate": 2.766581165262001e-06, "loss": 0.3545, "step": 2882 }, { "epoch": 2.851632047477745, "grad_norm": 0.22214805828542464, "learning_rate": 2.748259435690729e-06, "loss": 0.358, "step": 2883 }, { "epoch": 2.8526211671612263, "grad_norm": 0.2004977051100376, "learning_rate": 2.729937706119458e-06, "loss": 0.3267, "step": 2884 }, { "epoch": 2.853610286844708, "grad_norm": 0.21295550797121635, "learning_rate": 2.7116159765481865e-06, "loss": 0.3417, "step": 2885 }, { "epoch": 2.85459940652819, "grad_norm": 0.2103516015542619, "learning_rate": 2.6932942469769146e-06, "loss": 0.3276, "step": 2886 }, { "epoch": 2.8555885262116716, "grad_norm": 0.20954170278454848, "learning_rate": 2.674972517405643e-06, "loss": 0.3189, "step": 2887 }, { "epoch": 2.856577645895153, "grad_norm": 0.20360676369415176, "learning_rate": 2.6566507878343717e-06, "loss": 0.319, "step": 2888 }, { "epoch": 2.857566765578635, "grad_norm": 0.21277689756209228, "learning_rate": 2.6383290582631003e-06, "loss": 0.3326, "step": 2889 }, { "epoch": 2.858555885262117, "grad_norm": 0.21752596509030658, "learning_rate": 2.6200073286918288e-06, "loss": 0.3731, "step": 2890 }, { "epoch": 2.8595450049455984, "grad_norm": 0.2138648178065751, "learning_rate": 2.6016855991205573e-06, "loss": 0.3556, "step": 2891 }, { "epoch": 2.86053412462908, "grad_norm": 0.21421289059170068, "learning_rate": 2.5833638695492854e-06, "loss": 0.3449, "step": 2892 }, { "epoch": 2.8615232443125618, "grad_norm": 0.22802713240073696, "learning_rate": 2.565042139978014e-06, "loss": 0.339, "step": 2893 }, { "epoch": 2.8625123639960437, "grad_norm": 0.20891785110011957, "learning_rate": 2.5467204104067425e-06, "loss": 0.3272, "step": 2894 }, { "epoch": 2.863501483679525, "grad_norm": 0.2154505874086274, "learning_rate": 2.528398680835471e-06, "loss": 0.3493, "step": 2895 }, { "epoch": 2.8644906033630066, "grad_norm": 0.21116137690722792, "learning_rate": 2.5100769512641996e-06, "loss": 0.3261, "step": 2896 }, { "epoch": 2.8654797230464886, "grad_norm": 0.20872464139224442, "learning_rate": 2.491755221692928e-06, "loss": 0.3091, "step": 2897 }, { "epoch": 2.8664688427299705, "grad_norm": 0.22172791728030142, "learning_rate": 2.4734334921216562e-06, "loss": 0.3674, "step": 2898 }, { "epoch": 2.867457962413452, "grad_norm": 0.22760995786242147, "learning_rate": 2.4551117625503848e-06, "loss": 0.3728, "step": 2899 }, { "epoch": 2.868447082096934, "grad_norm": 0.22014957904362617, "learning_rate": 2.4367900329791133e-06, "loss": 0.3356, "step": 2900 }, { "epoch": 2.8694362017804154, "grad_norm": 0.21304917465749867, "learning_rate": 2.418468303407842e-06, "loss": 0.3579, "step": 2901 }, { "epoch": 2.8704253214638973, "grad_norm": 0.19788363376546905, "learning_rate": 2.4001465738365704e-06, "loss": 0.3202, "step": 2902 }, { "epoch": 2.8714144411473788, "grad_norm": 0.21988418201730414, "learning_rate": 2.381824844265299e-06, "loss": 0.3181, "step": 2903 }, { "epoch": 2.8724035608308607, "grad_norm": 0.20738796103864413, "learning_rate": 2.363503114694027e-06, "loss": 0.3477, "step": 2904 }, { "epoch": 2.873392680514342, "grad_norm": 0.21762364979569407, "learning_rate": 2.3451813851227555e-06, "loss": 0.3439, "step": 2905 }, { "epoch": 2.874381800197824, "grad_norm": 0.2143578729116185, "learning_rate": 2.326859655551484e-06, "loss": 0.3348, "step": 2906 }, { "epoch": 2.8753709198813056, "grad_norm": 0.232885047628153, "learning_rate": 2.3085379259802126e-06, "loss": 0.3748, "step": 2907 }, { "epoch": 2.8763600395647875, "grad_norm": 0.21077048066014784, "learning_rate": 2.290216196408941e-06, "loss": 0.3429, "step": 2908 }, { "epoch": 2.877349159248269, "grad_norm": 0.21001095217110033, "learning_rate": 2.2718944668376697e-06, "loss": 0.3032, "step": 2909 }, { "epoch": 2.878338278931751, "grad_norm": 0.2166268520185779, "learning_rate": 2.253572737266398e-06, "loss": 0.3875, "step": 2910 }, { "epoch": 2.8793273986152323, "grad_norm": 0.21898528679644236, "learning_rate": 2.2352510076951263e-06, "loss": 0.3256, "step": 2911 }, { "epoch": 2.8803165182987143, "grad_norm": 0.22735390510825595, "learning_rate": 2.216929278123855e-06, "loss": 0.4083, "step": 2912 }, { "epoch": 2.8813056379821957, "grad_norm": 0.23062531916202664, "learning_rate": 2.1986075485525834e-06, "loss": 0.372, "step": 2913 }, { "epoch": 2.8822947576656777, "grad_norm": 0.21477846290717173, "learning_rate": 2.180285818981312e-06, "loss": 0.3271, "step": 2914 }, { "epoch": 2.883283877349159, "grad_norm": 0.2117459683443643, "learning_rate": 2.1619640894100405e-06, "loss": 0.3577, "step": 2915 }, { "epoch": 2.884272997032641, "grad_norm": 0.21937476541837353, "learning_rate": 2.1436423598387686e-06, "loss": 0.3423, "step": 2916 }, { "epoch": 2.8852621167161225, "grad_norm": 0.1976009681830841, "learning_rate": 2.125320630267497e-06, "loss": 0.3253, "step": 2917 }, { "epoch": 2.8862512363996045, "grad_norm": 0.21302952806939138, "learning_rate": 2.106998900696226e-06, "loss": 0.3305, "step": 2918 }, { "epoch": 2.887240356083086, "grad_norm": 0.206815293905744, "learning_rate": 2.088677171124954e-06, "loss": 0.3496, "step": 2919 }, { "epoch": 2.888229475766568, "grad_norm": 0.1995894143318209, "learning_rate": 2.0703554415536827e-06, "loss": 0.3112, "step": 2920 }, { "epoch": 2.8892185954500493, "grad_norm": 0.19955543207743653, "learning_rate": 2.0520337119824113e-06, "loss": 0.3113, "step": 2921 }, { "epoch": 2.8902077151335313, "grad_norm": 0.2158562754144166, "learning_rate": 2.0337119824111394e-06, "loss": 0.3698, "step": 2922 }, { "epoch": 2.8911968348170127, "grad_norm": 0.20864063973766814, "learning_rate": 2.0153902528398683e-06, "loss": 0.3441, "step": 2923 }, { "epoch": 2.8921859545004946, "grad_norm": 0.2119247973375731, "learning_rate": 1.997068523268597e-06, "loss": 0.3437, "step": 2924 }, { "epoch": 2.893175074183976, "grad_norm": 0.2232073700071952, "learning_rate": 1.978746793697325e-06, "loss": 0.3929, "step": 2925 }, { "epoch": 2.894164193867458, "grad_norm": 0.20286778897421978, "learning_rate": 1.9604250641260535e-06, "loss": 0.3237, "step": 2926 }, { "epoch": 2.8951533135509395, "grad_norm": 0.2092640542037701, "learning_rate": 1.942103334554782e-06, "loss": 0.2943, "step": 2927 }, { "epoch": 2.8961424332344214, "grad_norm": 0.2088218793092618, "learning_rate": 1.9237816049835106e-06, "loss": 0.3332, "step": 2928 }, { "epoch": 2.897131552917903, "grad_norm": 0.20119421841154572, "learning_rate": 1.905459875412239e-06, "loss": 0.319, "step": 2929 }, { "epoch": 2.898120672601385, "grad_norm": 0.20219374342024857, "learning_rate": 1.8871381458409674e-06, "loss": 0.3187, "step": 2930 }, { "epoch": 2.8991097922848663, "grad_norm": 0.19777287722260406, "learning_rate": 1.8688164162696962e-06, "loss": 0.3202, "step": 2931 }, { "epoch": 2.9000989119683482, "grad_norm": 0.20427420158126633, "learning_rate": 1.8504946866984243e-06, "loss": 0.322, "step": 2932 }, { "epoch": 2.9010880316518297, "grad_norm": 0.21551527284059774, "learning_rate": 1.8321729571271528e-06, "loss": 0.3069, "step": 2933 }, { "epoch": 2.9020771513353116, "grad_norm": 0.199753848958888, "learning_rate": 1.8138512275558816e-06, "loss": 0.291, "step": 2934 }, { "epoch": 2.903066271018793, "grad_norm": 0.2313478126214845, "learning_rate": 1.7955294979846097e-06, "loss": 0.368, "step": 2935 }, { "epoch": 2.904055390702275, "grad_norm": 0.20699383705522323, "learning_rate": 1.7772077684133384e-06, "loss": 0.3402, "step": 2936 }, { "epoch": 2.905044510385757, "grad_norm": 0.2187505761089255, "learning_rate": 1.758886038842067e-06, "loss": 0.3623, "step": 2937 }, { "epoch": 2.9060336300692384, "grad_norm": 0.19345606798141826, "learning_rate": 1.740564309270795e-06, "loss": 0.3104, "step": 2938 }, { "epoch": 2.90702274975272, "grad_norm": 0.21864657860270537, "learning_rate": 1.7222425796995238e-06, "loss": 0.3415, "step": 2939 }, { "epoch": 2.908011869436202, "grad_norm": 0.19683732715540603, "learning_rate": 1.7039208501282524e-06, "loss": 0.2874, "step": 2940 }, { "epoch": 2.9090009891196837, "grad_norm": 0.21266519134677597, "learning_rate": 1.6855991205569805e-06, "loss": 0.3586, "step": 2941 }, { "epoch": 2.909990108803165, "grad_norm": 0.21398924276198206, "learning_rate": 1.6672773909857092e-06, "loss": 0.3229, "step": 2942 }, { "epoch": 2.9109792284866467, "grad_norm": 0.22495179099039847, "learning_rate": 1.6489556614144378e-06, "loss": 0.3834, "step": 2943 }, { "epoch": 2.9119683481701286, "grad_norm": 0.2024056953522565, "learning_rate": 1.630633931843166e-06, "loss": 0.3348, "step": 2944 }, { "epoch": 2.9129574678536105, "grad_norm": 0.21753481027936725, "learning_rate": 1.6123122022718946e-06, "loss": 0.3871, "step": 2945 }, { "epoch": 2.913946587537092, "grad_norm": 0.23414035730320035, "learning_rate": 1.5939904727006232e-06, "loss": 0.403, "step": 2946 }, { "epoch": 2.9149357072205735, "grad_norm": 0.20610158192122174, "learning_rate": 1.5756687431293515e-06, "loss": 0.3576, "step": 2947 }, { "epoch": 2.9159248269040554, "grad_norm": 0.2264080372941922, "learning_rate": 1.55734701355808e-06, "loss": 0.3619, "step": 2948 }, { "epoch": 2.9169139465875373, "grad_norm": 0.20401730729882686, "learning_rate": 1.5390252839868083e-06, "loss": 0.3416, "step": 2949 }, { "epoch": 2.917903066271019, "grad_norm": 0.20630091994141, "learning_rate": 1.5207035544155369e-06, "loss": 0.3122, "step": 2950 }, { "epoch": 2.9188921859545003, "grad_norm": 0.2284728589041083, "learning_rate": 1.5023818248442654e-06, "loss": 0.3559, "step": 2951 }, { "epoch": 2.919881305637982, "grad_norm": 0.21523792128505556, "learning_rate": 1.4840600952729937e-06, "loss": 0.3322, "step": 2952 }, { "epoch": 2.920870425321464, "grad_norm": 0.21968342654749495, "learning_rate": 1.4657383657017225e-06, "loss": 0.3689, "step": 2953 }, { "epoch": 2.9218595450049456, "grad_norm": 0.22690637293425686, "learning_rate": 1.4474166361304508e-06, "loss": 0.3335, "step": 2954 }, { "epoch": 2.922848664688427, "grad_norm": 0.23072015845383623, "learning_rate": 1.4290949065591791e-06, "loss": 0.383, "step": 2955 }, { "epoch": 2.923837784371909, "grad_norm": 0.2227436812072427, "learning_rate": 1.4107731769879079e-06, "loss": 0.3514, "step": 2956 }, { "epoch": 2.924826904055391, "grad_norm": 0.23110200374260195, "learning_rate": 1.3924514474166362e-06, "loss": 0.4092, "step": 2957 }, { "epoch": 2.9258160237388724, "grad_norm": 0.20321476984964387, "learning_rate": 1.3741297178453645e-06, "loss": 0.3342, "step": 2958 }, { "epoch": 2.926805143422354, "grad_norm": 0.18821044499165784, "learning_rate": 1.3558079882740933e-06, "loss": 0.2944, "step": 2959 }, { "epoch": 2.927794263105836, "grad_norm": 0.2133703549279981, "learning_rate": 1.3374862587028216e-06, "loss": 0.3528, "step": 2960 }, { "epoch": 2.9287833827893177, "grad_norm": 0.19949471309765202, "learning_rate": 1.3191645291315501e-06, "loss": 0.3012, "step": 2961 }, { "epoch": 2.929772502472799, "grad_norm": 0.20980980030921756, "learning_rate": 1.3008427995602787e-06, "loss": 0.3459, "step": 2962 }, { "epoch": 2.9307616221562807, "grad_norm": 0.20593464559520894, "learning_rate": 1.282521069989007e-06, "loss": 0.3328, "step": 2963 }, { "epoch": 2.9317507418397626, "grad_norm": 0.20716599210238562, "learning_rate": 1.2641993404177355e-06, "loss": 0.3473, "step": 2964 }, { "epoch": 2.9327398615232445, "grad_norm": 0.22715584225040256, "learning_rate": 1.245877610846464e-06, "loss": 0.3487, "step": 2965 }, { "epoch": 2.933728981206726, "grad_norm": 0.19923436499191036, "learning_rate": 1.2275558812751924e-06, "loss": 0.3245, "step": 2966 }, { "epoch": 2.9347181008902075, "grad_norm": 0.21117769437499914, "learning_rate": 1.209234151703921e-06, "loss": 0.3417, "step": 2967 }, { "epoch": 2.9357072205736894, "grad_norm": 0.2198013877108107, "learning_rate": 1.1909124221326494e-06, "loss": 0.3786, "step": 2968 }, { "epoch": 2.9366963402571713, "grad_norm": 0.20015001058595291, "learning_rate": 1.1725906925613778e-06, "loss": 0.3256, "step": 2969 }, { "epoch": 2.9376854599406528, "grad_norm": 0.21276982029063424, "learning_rate": 1.1542689629901063e-06, "loss": 0.3417, "step": 2970 }, { "epoch": 2.9386745796241343, "grad_norm": 0.22527713295979443, "learning_rate": 1.1359472334188348e-06, "loss": 0.3625, "step": 2971 }, { "epoch": 2.939663699307616, "grad_norm": 0.201432544398078, "learning_rate": 1.1176255038475632e-06, "loss": 0.3073, "step": 2972 }, { "epoch": 2.940652818991098, "grad_norm": 7.943101719255052, "learning_rate": 1.0993037742762917e-06, "loss": 0.8247, "step": 2973 }, { "epoch": 2.9416419386745796, "grad_norm": 0.21340135227018642, "learning_rate": 1.0809820447050202e-06, "loss": 0.3784, "step": 2974 }, { "epoch": 2.9426310583580615, "grad_norm": 0.2113735158072839, "learning_rate": 1.0626603151337486e-06, "loss": 0.3574, "step": 2975 }, { "epoch": 2.943620178041543, "grad_norm": 0.19609645027272843, "learning_rate": 1.044338585562477e-06, "loss": 0.3304, "step": 2976 }, { "epoch": 2.944609297725025, "grad_norm": 0.21099389986750994, "learning_rate": 1.0260168559912056e-06, "loss": 0.346, "step": 2977 }, { "epoch": 2.9455984174085064, "grad_norm": 0.21796864077492323, "learning_rate": 1.0076951264199342e-06, "loss": 0.3519, "step": 2978 }, { "epoch": 2.9465875370919883, "grad_norm": 0.2382189042915159, "learning_rate": 9.893733968486625e-07, "loss": 0.3673, "step": 2979 }, { "epoch": 2.9475766567754698, "grad_norm": 0.20419821656474832, "learning_rate": 9.71051667277391e-07, "loss": 0.3464, "step": 2980 }, { "epoch": 2.9485657764589517, "grad_norm": 0.22395363781798322, "learning_rate": 9.527299377061195e-07, "loss": 0.3279, "step": 2981 }, { "epoch": 2.949554896142433, "grad_norm": 0.20253721100448752, "learning_rate": 9.344082081348481e-07, "loss": 0.3368, "step": 2982 }, { "epoch": 2.950544015825915, "grad_norm": 0.2022713938179965, "learning_rate": 9.160864785635764e-07, "loss": 0.3311, "step": 2983 }, { "epoch": 2.9515331355093966, "grad_norm": 0.20595080269509453, "learning_rate": 8.977647489923048e-07, "loss": 0.352, "step": 2984 }, { "epoch": 2.9525222551928785, "grad_norm": 0.1987103967446778, "learning_rate": 8.794430194210335e-07, "loss": 0.3465, "step": 2985 }, { "epoch": 2.95351137487636, "grad_norm": 0.2057786134643462, "learning_rate": 8.611212898497619e-07, "loss": 0.3466, "step": 2986 }, { "epoch": 2.954500494559842, "grad_norm": 0.21967452126576467, "learning_rate": 8.427995602784902e-07, "loss": 0.3539, "step": 2987 }, { "epoch": 2.9554896142433233, "grad_norm": 0.22985902892440865, "learning_rate": 8.244778307072189e-07, "loss": 0.3911, "step": 2988 }, { "epoch": 2.9564787339268053, "grad_norm": 0.2201316492538097, "learning_rate": 8.061561011359473e-07, "loss": 0.3493, "step": 2989 }, { "epoch": 2.9574678536102867, "grad_norm": 0.19954226377424544, "learning_rate": 7.878343715646757e-07, "loss": 0.3463, "step": 2990 }, { "epoch": 2.9584569732937687, "grad_norm": 0.20222148981802693, "learning_rate": 7.695126419934042e-07, "loss": 0.3135, "step": 2991 }, { "epoch": 2.95944609297725, "grad_norm": 0.21871651088863459, "learning_rate": 7.511909124221327e-07, "loss": 0.3676, "step": 2992 }, { "epoch": 2.960435212660732, "grad_norm": 0.21927670774593283, "learning_rate": 7.328691828508612e-07, "loss": 0.3351, "step": 2993 }, { "epoch": 2.9614243323442135, "grad_norm": 0.208780563559218, "learning_rate": 7.145474532795896e-07, "loss": 0.3447, "step": 2994 }, { "epoch": 2.9624134520276955, "grad_norm": 0.2126920301550537, "learning_rate": 6.962257237083181e-07, "loss": 0.3608, "step": 2995 }, { "epoch": 2.963402571711177, "grad_norm": 0.20548958069933887, "learning_rate": 6.779039941370466e-07, "loss": 0.3304, "step": 2996 }, { "epoch": 2.964391691394659, "grad_norm": 0.2036679215154334, "learning_rate": 6.595822645657751e-07, "loss": 0.3463, "step": 2997 }, { "epoch": 2.9653808110781403, "grad_norm": 0.21172321613940526, "learning_rate": 6.412605349945035e-07, "loss": 0.3599, "step": 2998 }, { "epoch": 2.9663699307616223, "grad_norm": 0.21160961055287664, "learning_rate": 6.22938805423232e-07, "loss": 0.3891, "step": 2999 }, { "epoch": 2.9673590504451037, "grad_norm": 0.21327213001738943, "learning_rate": 6.046170758519605e-07, "loss": 0.3691, "step": 3000 }, { "epoch": 2.9683481701285857, "grad_norm": 0.21547058365602503, "learning_rate": 5.862953462806889e-07, "loss": 0.3385, "step": 3001 }, { "epoch": 2.969337289812067, "grad_norm": 0.20224924064139008, "learning_rate": 5.679736167094174e-07, "loss": 0.3285, "step": 3002 }, { "epoch": 2.970326409495549, "grad_norm": 0.219988718165209, "learning_rate": 5.496518871381459e-07, "loss": 0.3431, "step": 3003 }, { "epoch": 2.9713155291790305, "grad_norm": 0.22283985943183246, "learning_rate": 5.313301575668743e-07, "loss": 0.3386, "step": 3004 }, { "epoch": 2.9723046488625124, "grad_norm": 0.20042527762749443, "learning_rate": 5.130084279956028e-07, "loss": 0.3339, "step": 3005 }, { "epoch": 2.973293768545994, "grad_norm": 0.9140786441671352, "learning_rate": 4.946866984243312e-07, "loss": 0.3563, "step": 3006 }, { "epoch": 2.974282888229476, "grad_norm": 0.20417408391682282, "learning_rate": 4.763649688530597e-07, "loss": 0.3401, "step": 3007 }, { "epoch": 2.9752720079129573, "grad_norm": 0.20602866956947444, "learning_rate": 4.580432392817882e-07, "loss": 0.3247, "step": 3008 }, { "epoch": 2.9762611275964392, "grad_norm": 0.21276857507850871, "learning_rate": 4.3972150971051674e-07, "loss": 0.3594, "step": 3009 }, { "epoch": 2.9772502472799207, "grad_norm": 0.19943196340758107, "learning_rate": 4.213997801392451e-07, "loss": 0.3296, "step": 3010 }, { "epoch": 2.9782393669634026, "grad_norm": 0.21268206131905573, "learning_rate": 4.0307805056797366e-07, "loss": 0.3423, "step": 3011 }, { "epoch": 2.979228486646884, "grad_norm": 0.2057146467485511, "learning_rate": 3.847563209967021e-07, "loss": 0.3186, "step": 3012 }, { "epoch": 2.980217606330366, "grad_norm": 0.20744284251947426, "learning_rate": 3.664345914254306e-07, "loss": 0.3208, "step": 3013 }, { "epoch": 2.9812067260138475, "grad_norm": 0.20532924838610409, "learning_rate": 3.4811286185415905e-07, "loss": 0.3296, "step": 3014 }, { "epoch": 2.9821958456973294, "grad_norm": 0.2033415582326357, "learning_rate": 3.2979113228288753e-07, "loss": 0.3257, "step": 3015 }, { "epoch": 2.9831849653808113, "grad_norm": 0.20754506694784877, "learning_rate": 3.11469402711616e-07, "loss": 0.3296, "step": 3016 }, { "epoch": 2.984174085064293, "grad_norm": 0.20629067666131826, "learning_rate": 2.9314767314034444e-07, "loss": 0.3189, "step": 3017 }, { "epoch": 2.9851632047477743, "grad_norm": 0.22938650262981103, "learning_rate": 2.748259435690729e-07, "loss": 0.3406, "step": 3018 }, { "epoch": 2.986152324431256, "grad_norm": 0.20636895308921352, "learning_rate": 2.565042139978014e-07, "loss": 0.3344, "step": 3019 }, { "epoch": 2.987141444114738, "grad_norm": 0.20357483574027163, "learning_rate": 2.3818248442652986e-07, "loss": 0.3484, "step": 3020 }, { "epoch": 2.9881305637982196, "grad_norm": 0.214430972642966, "learning_rate": 2.1986075485525837e-07, "loss": 0.3679, "step": 3021 }, { "epoch": 2.989119683481701, "grad_norm": 0.19589997688875407, "learning_rate": 2.0153902528398683e-07, "loss": 0.3265, "step": 3022 }, { "epoch": 2.990108803165183, "grad_norm": 0.23345971886153838, "learning_rate": 1.832172957127153e-07, "loss": 0.3985, "step": 3023 }, { "epoch": 2.991097922848665, "grad_norm": 0.1981503597152391, "learning_rate": 1.6489556614144377e-07, "loss": 0.3099, "step": 3024 }, { "epoch": 2.9920870425321464, "grad_norm": 0.2145800019812682, "learning_rate": 1.4657383657017222e-07, "loss": 0.3628, "step": 3025 }, { "epoch": 2.993076162215628, "grad_norm": 0.1993615760861862, "learning_rate": 1.282521069989007e-07, "loss": 0.3104, "step": 3026 }, { "epoch": 2.99406528189911, "grad_norm": 0.22261786031016298, "learning_rate": 1.0993037742762919e-07, "loss": 0.373, "step": 3027 }, { "epoch": 2.9950544015825917, "grad_norm": 0.2022644187391328, "learning_rate": 9.160864785635765e-08, "loss": 0.3516, "step": 3028 }, { "epoch": 2.996043521266073, "grad_norm": 0.19548068811302774, "learning_rate": 7.328691828508611e-08, "loss": 0.3303, "step": 3029 }, { "epoch": 2.9970326409495547, "grad_norm": 0.202037553129905, "learning_rate": 5.496518871381459e-08, "loss": 0.3579, "step": 3030 }, { "epoch": 2.9980217606330366, "grad_norm": 0.19575234484936066, "learning_rate": 3.6643459142543055e-08, "loss": 0.3171, "step": 3031 }, { "epoch": 2.9990108803165185, "grad_norm": 0.21172137281490067, "learning_rate": 1.8321729571271528e-08, "loss": 0.3511, "step": 3032 }, { "epoch": 3.0, "grad_norm": 0.20754262490189151, "learning_rate": 0.0, "loss": 0.3455, "step": 3033 }, { "epoch": 3.0, "step": 3033, "total_flos": 2.5830390820744724e+18, "train_loss": 0.5209268033307966, "train_runtime": 175746.8725, "train_samples_per_second": 0.276, "train_steps_per_second": 0.017 } ], "logging_steps": 1, "max_steps": 3033, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.5830390820744724e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }