| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.3770111883614428, | |
| "eval_steps": 500, | |
| "global_step": 34000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 3.2580924034118652, | |
| "learning_rate": 2.2172949002217296e-06, | |
| "loss": 10.2933, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 2.4347386360168457, | |
| "learning_rate": 4.434589800443459e-06, | |
| "loss": 10.1894, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 2.3895885944366455, | |
| "learning_rate": 6.651884700665188e-06, | |
| "loss": 10.1424, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 2.129647731781006, | |
| "learning_rate": 8.869179600886918e-06, | |
| "loss": 10.0995, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 2.3564186096191406, | |
| "learning_rate": 1.1086474501108649e-05, | |
| "loss": 10.0479, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 1.830551028251648, | |
| "learning_rate": 1.3303769401330377e-05, | |
| "loss": 9.9971, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 2.1173911094665527, | |
| "learning_rate": 1.5521064301552106e-05, | |
| "loss": 9.9201, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 1.6636557579040527, | |
| "learning_rate": 1.7738359201773837e-05, | |
| "loss": 9.8562, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 2.4503839015960693, | |
| "learning_rate": 1.9955654101995567e-05, | |
| "loss": 9.7599, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.822424054145813, | |
| "learning_rate": 2.2172949002217298e-05, | |
| "loss": 9.6608, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.6598998308181763, | |
| "learning_rate": 2.4390243902439026e-05, | |
| "loss": 9.55, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.8471707105636597, | |
| "learning_rate": 2.6607538802660753e-05, | |
| "loss": 9.4606, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.4833533763885498, | |
| "learning_rate": 2.8824833702882487e-05, | |
| "loss": 9.3283, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.688541054725647, | |
| "learning_rate": 3.104212860310421e-05, | |
| "loss": 9.2229, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.6466543674468994, | |
| "learning_rate": 3.325942350332594e-05, | |
| "loss": 9.1093, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.4169293642044067, | |
| "learning_rate": 3.547671840354767e-05, | |
| "loss": 8.9703, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.7079193592071533, | |
| "learning_rate": 3.7694013303769404e-05, | |
| "loss": 8.8351, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.5513204336166382, | |
| "learning_rate": 3.9911308203991135e-05, | |
| "loss": 8.7111, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.485573172569275, | |
| "learning_rate": 4.212860310421286e-05, | |
| "loss": 8.5627, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.511690616607666, | |
| "learning_rate": 4.4345898004434597e-05, | |
| "loss": 8.5042, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 2.1478614807128906, | |
| "learning_rate": 4.656319290465632e-05, | |
| "loss": 8.3287, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.4060652256011963, | |
| "learning_rate": 4.878048780487805e-05, | |
| "loss": 8.2341, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.3950035572052002, | |
| "learning_rate": 5.099778270509978e-05, | |
| "loss": 8.1277, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.5197688341140747, | |
| "learning_rate": 5.3215077605321506e-05, | |
| "loss": 8.0311, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.3406693935394287, | |
| "learning_rate": 5.543237250554324e-05, | |
| "loss": 7.9824, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.4520119428634644, | |
| "learning_rate": 5.7649667405764975e-05, | |
| "loss": 7.9948, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.179124116897583, | |
| "learning_rate": 5.98669623059867e-05, | |
| "loss": 7.9144, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.4039533138275146, | |
| "learning_rate": 6.208425720620842e-05, | |
| "loss": 7.8768, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.5542700290679932, | |
| "learning_rate": 6.430155210643016e-05, | |
| "loss": 7.894, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.4150550365447998, | |
| "learning_rate": 6.651884700665188e-05, | |
| "loss": 7.8409, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.6647827625274658, | |
| "learning_rate": 6.873614190687362e-05, | |
| "loss": 7.91, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.7795697450637817, | |
| "learning_rate": 7.095343680709535e-05, | |
| "loss": 7.8256, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.933110237121582, | |
| "learning_rate": 7.317073170731707e-05, | |
| "loss": 7.8463, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.1942570209503174, | |
| "learning_rate": 7.538802660753881e-05, | |
| "loss": 7.7827, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.6759297847747803, | |
| "learning_rate": 7.760532150776053e-05, | |
| "loss": 7.8, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.093256950378418, | |
| "learning_rate": 7.982261640798227e-05, | |
| "loss": 7.7461, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.567872166633606, | |
| "learning_rate": 8.2039911308204e-05, | |
| "loss": 7.7338, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.3017679452896118, | |
| "learning_rate": 8.425720620842572e-05, | |
| "loss": 7.804, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.7510960102081299, | |
| "learning_rate": 8.647450110864746e-05, | |
| "loss": 7.7405, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.7215120792388916, | |
| "learning_rate": 8.869179600886919e-05, | |
| "loss": 7.7429, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.6202715635299683, | |
| "learning_rate": 9.090909090909092e-05, | |
| "loss": 7.6588, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.5680756568908691, | |
| "learning_rate": 9.312638580931264e-05, | |
| "loss": 7.6224, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.462240219116211, | |
| "learning_rate": 9.534368070953438e-05, | |
| "loss": 7.6851, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 2.2018320560455322, | |
| "learning_rate": 9.75609756097561e-05, | |
| "loss": 7.6443, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.9520208835601807, | |
| "learning_rate": 9.977827050997783e-05, | |
| "loss": 7.6456, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.115421175956726, | |
| "learning_rate": 0.00010199556541019956, | |
| "loss": 7.5894, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.6002250909805298, | |
| "learning_rate": 0.0001042128603104213, | |
| "loss": 7.6017, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.6516796350479126, | |
| "learning_rate": 0.00010643015521064301, | |
| "loss": 7.4548, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 2.2168257236480713, | |
| "learning_rate": 0.00010864745011086475, | |
| "loss": 7.5867, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.5447593927383423, | |
| "learning_rate": 0.00011086474501108647, | |
| "loss": 7.5317, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.6840906143188477, | |
| "learning_rate": 0.00011308203991130821, | |
| "loss": 7.5127, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.2965503931045532, | |
| "learning_rate": 0.00011529933481152995, | |
| "loss": 7.4911, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.643584966659546, | |
| "learning_rate": 0.00011751662971175166, | |
| "loss": 7.4416, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.5419111251831055, | |
| "learning_rate": 0.0001197339246119734, | |
| "loss": 7.4944, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.7774205207824707, | |
| "learning_rate": 0.00012195121951219512, | |
| "loss": 7.4244, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 2.1709322929382324, | |
| "learning_rate": 0.00012416851441241685, | |
| "loss": 7.371, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.5503411293029785, | |
| "learning_rate": 0.0001263858093126386, | |
| "loss": 7.3031, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.7744035720825195, | |
| "learning_rate": 0.00012860310421286032, | |
| "loss": 7.3338, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 2.2014000415802, | |
| "learning_rate": 0.00013082039911308205, | |
| "loss": 7.2962, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.6716220378875732, | |
| "learning_rate": 0.00013303769401330377, | |
| "loss": 7.3348, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.7045074701309204, | |
| "learning_rate": 0.0001352549889135255, | |
| "loss": 7.2864, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.8933771848678589, | |
| "learning_rate": 0.00013747228381374724, | |
| "loss": 7.2744, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 2.298779249191284, | |
| "learning_rate": 0.00013968957871396897, | |
| "loss": 7.2472, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 1.3420922756195068, | |
| "learning_rate": 0.0001419068736141907, | |
| "loss": 7.3019, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 1.9339039325714111, | |
| "learning_rate": 0.00014412416851441242, | |
| "loss": 7.2982, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 2.69667387008667, | |
| "learning_rate": 0.00014634146341463414, | |
| "loss": 7.2851, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 2.3124189376831055, | |
| "learning_rate": 0.0001485587583148559, | |
| "loss": 7.258, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 1.975651741027832, | |
| "learning_rate": 0.00015077605321507762, | |
| "loss": 7.1275, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 1.9704022407531738, | |
| "learning_rate": 0.00015299334811529934, | |
| "loss": 7.1473, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 2.5047757625579834, | |
| "learning_rate": 0.00015521064301552106, | |
| "loss": 7.1096, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 1.5465894937515259, | |
| "learning_rate": 0.0001574279379157428, | |
| "loss": 7.1501, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 1.9557933807373047, | |
| "learning_rate": 0.00015964523281596454, | |
| "loss": 7.2033, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 2.420116424560547, | |
| "learning_rate": 0.00016186252771618626, | |
| "loss": 7.1275, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 2.114737033843994, | |
| "learning_rate": 0.000164079822616408, | |
| "loss": 7.0932, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 2.3085389137268066, | |
| "learning_rate": 0.00016629711751662974, | |
| "loss": 7.0311, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 2.5679140090942383, | |
| "learning_rate": 0.00016851441241685144, | |
| "loss": 6.9168, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 1.8611838817596436, | |
| "learning_rate": 0.0001707317073170732, | |
| "loss": 7.0085, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 1.8603994846343994, | |
| "learning_rate": 0.0001729490022172949, | |
| "loss": 6.9432, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 2.4244627952575684, | |
| "learning_rate": 0.00017516629711751663, | |
| "loss": 6.9333, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 2.177870750427246, | |
| "learning_rate": 0.00017738359201773839, | |
| "loss": 6.9499, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 1.9320554733276367, | |
| "learning_rate": 0.00017960088691796008, | |
| "loss": 6.8204, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 1.5062849521636963, | |
| "learning_rate": 0.00018181818181818183, | |
| "loss": 6.9505, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 2.9272422790527344, | |
| "learning_rate": 0.00018403547671840356, | |
| "loss": 6.8701, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 2.0309596061706543, | |
| "learning_rate": 0.00018625277161862528, | |
| "loss": 6.924, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 2.0265886783599854, | |
| "learning_rate": 0.00018847006651884703, | |
| "loss": 6.9223, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 2.5160486698150635, | |
| "learning_rate": 0.00019068736141906876, | |
| "loss": 6.8708, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 2.613301992416382, | |
| "learning_rate": 0.00019290465631929045, | |
| "loss": 6.8937, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 2.3031229972839355, | |
| "learning_rate": 0.0001951219512195122, | |
| "loss": 6.8337, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 2.54779052734375, | |
| "learning_rate": 0.00019733924611973393, | |
| "loss": 6.8334, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 2.8277971744537354, | |
| "learning_rate": 0.00019955654101995565, | |
| "loss": 6.7925, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 2.0113885402679443, | |
| "learning_rate": 0.00019999989242739025, | |
| "loss": 6.8458, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 2.2395377159118652, | |
| "learning_rate": 0.00019999945541405976, | |
| "loss": 6.6251, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 2.445993423461914, | |
| "learning_rate": 0.0001999986822381884, | |
| "loss": 6.8099, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 4.077752590179443, | |
| "learning_rate": 0.0001999975729023753, | |
| "loss": 6.8053, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 3.167569875717163, | |
| "learning_rate": 0.00019999612741034963, | |
| "loss": 6.7706, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 1.893659234046936, | |
| "learning_rate": 0.00019999434576697066, | |
| "loss": 6.8245, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 3.6101326942443848, | |
| "learning_rate": 0.00019999222797822762, | |
| "loss": 6.7407, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 2.2858726978302, | |
| "learning_rate": 0.00019998977405123974, | |
| "loss": 6.74, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 1.9325459003448486, | |
| "learning_rate": 0.0001999869839942563, | |
| "loss": 6.716, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 2.0043437480926514, | |
| "learning_rate": 0.00019998385781665643, | |
| "loss": 6.6003, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 4.151523113250732, | |
| "learning_rate": 0.00019998039552894924, | |
| "loss": 6.6801, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 3.8407771587371826, | |
| "learning_rate": 0.00019997659714277372, | |
| "loss": 6.608, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 2.230713129043579, | |
| "learning_rate": 0.00019997246267089867, | |
| "loss": 6.6479, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 2.2546942234039307, | |
| "learning_rate": 0.0001999679921272227, | |
| "loss": 6.6548, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 3.180986166000366, | |
| "learning_rate": 0.00019996318552677425, | |
| "loss": 6.6851, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 2.341231346130371, | |
| "learning_rate": 0.00019995804288571134, | |
| "loss": 6.547, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 3.1117124557495117, | |
| "learning_rate": 0.00019995256422132172, | |
| "loss": 6.7072, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 2.0082530975341797, | |
| "learning_rate": 0.0001999467495520227, | |
| "loss": 6.5422, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 2.409489870071411, | |
| "learning_rate": 0.0001999405988973611, | |
| "loss": 6.3716, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 2.649052381515503, | |
| "learning_rate": 0.00019993411227801328, | |
| "loss": 6.6434, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 3.081116199493408, | |
| "learning_rate": 0.00019992728971578492, | |
| "loss": 6.4624, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 3.1578280925750732, | |
| "learning_rate": 0.00019992013123361102, | |
| "loss": 6.5416, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 3.7874557971954346, | |
| "learning_rate": 0.0001999126368555559, | |
| "loss": 6.4512, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 2.7693099975585938, | |
| "learning_rate": 0.00019990480660681293, | |
| "loss": 6.5105, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 2.4338185787200928, | |
| "learning_rate": 0.00019989680712666593, | |
| "loss": 6.5092, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 3.656937837600708, | |
| "learning_rate": 0.00019988831193270577, | |
| "loss": 6.4269, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 2.857292652130127, | |
| "learning_rate": 0.00019987948094982952, | |
| "loss": 6.4387, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 3.4963467121124268, | |
| "learning_rate": 0.00019987031420772385, | |
| "loss": 6.3851, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 2.602522611618042, | |
| "learning_rate": 0.00019986081173720396, | |
| "loss": 6.3413, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 2.6455273628234863, | |
| "learning_rate": 0.00019985097357021385, | |
| "loss": 6.2965, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 3.5592167377471924, | |
| "learning_rate": 0.0001998407997398259, | |
| "loss": 6.4293, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 3.6016533374786377, | |
| "learning_rate": 0.00019983029028024094, | |
| "loss": 6.2897, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 2.5536839962005615, | |
| "learning_rate": 0.000199819445226788, | |
| "loss": 6.3157, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 2.0514349937438965, | |
| "learning_rate": 0.00019980826461592427, | |
| "loss": 6.3847, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 2.72495174407959, | |
| "learning_rate": 0.00019979674848523505, | |
| "loss": 6.3517, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 2.4264872074127197, | |
| "learning_rate": 0.00019978489687343335, | |
| "loss": 6.2533, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 2.8361423015594482, | |
| "learning_rate": 0.0001997727098203602, | |
| "loss": 6.3654, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 2.9690892696380615, | |
| "learning_rate": 0.00019976018736698404, | |
| "loss": 6.3968, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 2.6132867336273193, | |
| "learning_rate": 0.0001997473295554009, | |
| "loss": 6.3444, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 4.820697784423828, | |
| "learning_rate": 0.00019973413642883424, | |
| "loss": 6.2019, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 2.2316782474517822, | |
| "learning_rate": 0.00019972060803163458, | |
| "loss": 6.2049, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 3.9528305530548096, | |
| "learning_rate": 0.00019970674440927957, | |
| "loss": 6.1718, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 1.891073226928711, | |
| "learning_rate": 0.0001996925456083738, | |
| "loss": 6.2393, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 2.813270092010498, | |
| "learning_rate": 0.00019967801167664853, | |
| "loss": 6.2116, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 2.2726826667785645, | |
| "learning_rate": 0.00019966314266296173, | |
| "loss": 6.1521, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 2.3895318508148193, | |
| "learning_rate": 0.00019964793861729772, | |
| "loss": 6.1072, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 3.190431833267212, | |
| "learning_rate": 0.000199632399590767, | |
| "loss": 6.2009, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 3.79266095161438, | |
| "learning_rate": 0.00019961652563560634, | |
| "loss": 6.028, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 3.260039806365967, | |
| "learning_rate": 0.00019960031680517826, | |
| "loss": 6.0733, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 3.0739686489105225, | |
| "learning_rate": 0.0001995837731539711, | |
| "loss": 6.0521, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 3.0517771244049072, | |
| "learning_rate": 0.00019956689473759872, | |
| "loss": 6.0544, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 3.9524648189544678, | |
| "learning_rate": 0.0001995496816128003, | |
| "loss": 6.1326, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 4.498497486114502, | |
| "learning_rate": 0.00019953213383744033, | |
| "loss": 6.236, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 4.157576084136963, | |
| "learning_rate": 0.00019951425147050807, | |
| "loss": 5.9898, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 3.9297516345977783, | |
| "learning_rate": 0.00019949603457211775, | |
| "loss": 6.086, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 3.3214786052703857, | |
| "learning_rate": 0.00019947748320350804, | |
| "loss": 5.9589, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 2.8847291469573975, | |
| "learning_rate": 0.00019945859742704201, | |
| "loss": 6.1931, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 3.387896776199341, | |
| "learning_rate": 0.00019943937730620702, | |
| "loss": 6.0539, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 3.1214797496795654, | |
| "learning_rate": 0.00019941982290561417, | |
| "loss": 6.0288, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 3.7995123863220215, | |
| "learning_rate": 0.00019939993429099841, | |
| "loss": 6.0526, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 4.788393974304199, | |
| "learning_rate": 0.00019937971152921818, | |
| "loss": 5.9799, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 4.009220123291016, | |
| "learning_rate": 0.0001993591546882552, | |
| "loss": 6.1223, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 3.5576276779174805, | |
| "learning_rate": 0.00019933826383721428, | |
| "loss": 5.989, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 3.1287412643432617, | |
| "learning_rate": 0.00019931703904632294, | |
| "loss": 6.0542, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 3.6518595218658447, | |
| "learning_rate": 0.00019929548038693146, | |
| "loss": 6.041, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 3.268080472946167, | |
| "learning_rate": 0.0001992735879315123, | |
| "loss": 5.888, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 3.6055593490600586, | |
| "learning_rate": 0.00019925136175366007, | |
| "loss": 5.913, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 4.866463661193848, | |
| "learning_rate": 0.00019922880192809137, | |
| "loss": 5.9858, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 3.44808292388916, | |
| "learning_rate": 0.00019920590853064423, | |
| "loss": 5.7686, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 2.9507765769958496, | |
| "learning_rate": 0.00019918268163827808, | |
| "loss": 5.8557, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 3.441870927810669, | |
| "learning_rate": 0.00019915912132907352, | |
| "loss": 5.8268, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 3.838809013366699, | |
| "learning_rate": 0.00019913522768223182, | |
| "loss": 5.9833, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 4.165487289428711, | |
| "learning_rate": 0.00019911100077807498, | |
| "loss": 5.7422, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 3.5947463512420654, | |
| "learning_rate": 0.0001990864406980452, | |
| "loss": 5.7479, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 4.130446434020996, | |
| "learning_rate": 0.00019906154752470472, | |
| "loss": 5.7767, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 4.866550922393799, | |
| "learning_rate": 0.00019903632134173554, | |
| "loss": 5.7681, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 3.2839725017547607, | |
| "learning_rate": 0.00019901076223393903, | |
| "loss": 5.6656, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 3.0762476921081543, | |
| "learning_rate": 0.0001989848702872359, | |
| "loss": 5.789, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 3.7109107971191406, | |
| "learning_rate": 0.00019895864558866556, | |
| "loss": 5.773, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 5.400998115539551, | |
| "learning_rate": 0.00019893208822638618, | |
| "loss": 5.7506, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 3.3062849044799805, | |
| "learning_rate": 0.00019890519828967413, | |
| "loss": 5.7515, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 4.109920501708984, | |
| "learning_rate": 0.00019887797586892373, | |
| "loss": 5.7972, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 3.4838390350341797, | |
| "learning_rate": 0.00019885042105564717, | |
| "loss": 5.6753, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 4.251760959625244, | |
| "learning_rate": 0.00019882253394247381, | |
| "loss": 5.6303, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 4.042376518249512, | |
| "learning_rate": 0.00019879431462315025, | |
| "loss": 5.5753, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 4.239652633666992, | |
| "learning_rate": 0.0001987657631925398, | |
| "loss": 5.5335, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 5.15481424331665, | |
| "learning_rate": 0.00019873687974662215, | |
| "loss": 5.5396, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 4.36835241317749, | |
| "learning_rate": 0.00019870766438249317, | |
| "loss": 5.6017, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 4.165258407592773, | |
| "learning_rate": 0.00019867811719836452, | |
| "loss": 5.7228, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 4.125988006591797, | |
| "learning_rate": 0.0001986482382935633, | |
| "loss": 5.5787, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 4.177731037139893, | |
| "learning_rate": 0.0001986180277685317, | |
| "loss": 5.5829, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 5.006561279296875, | |
| "learning_rate": 0.00019858748572482683, | |
| "loss": 5.5466, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 4.33070182800293, | |
| "learning_rate": 0.00019855661226512007, | |
| "loss": 5.5544, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 4.358560085296631, | |
| "learning_rate": 0.00019852540749319708, | |
| "loss": 5.4599, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 4.536096096038818, | |
| "learning_rate": 0.00019849387151395708, | |
| "loss": 5.4983, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 4.66163444519043, | |
| "learning_rate": 0.0001984620044334129, | |
| "loss": 5.4097, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 4.4319233894348145, | |
| "learning_rate": 0.00019842980635869024, | |
| "loss": 5.4093, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 4.98419713973999, | |
| "learning_rate": 0.0001983972773980276, | |
| "loss": 5.4056, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 3.6354339122772217, | |
| "learning_rate": 0.0001983644176607757, | |
| "loss": 5.3171, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 4.495342254638672, | |
| "learning_rate": 0.00019833122725739736, | |
| "loss": 5.4521, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 4.5558671951293945, | |
| "learning_rate": 0.00019829770629946678, | |
| "loss": 5.5158, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 3.7165732383728027, | |
| "learning_rate": 0.00019826385489966957, | |
| "loss": 5.301, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 6.030915260314941, | |
| "learning_rate": 0.00019822967317180204, | |
| "loss": 5.3316, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 5.385923385620117, | |
| "learning_rate": 0.00019819516123077094, | |
| "loss": 5.3844, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 4.383516788482666, | |
| "learning_rate": 0.00019816101926755305, | |
| "loss": 5.2995, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 4.446406364440918, | |
| "learning_rate": 0.00019812585384780055, | |
| "loss": 5.386, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 4.345483303070068, | |
| "learning_rate": 0.00019809035856388805, | |
| "loss": 5.2815, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 4.791261672973633, | |
| "learning_rate": 0.00019805453353513813, | |
| "loss": 5.3757, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 5.622151851654053, | |
| "learning_rate": 0.00019801837888198172, | |
| "loss": 5.4405, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 4.934606075286865, | |
| "learning_rate": 0.0001979818947259579, | |
| "loss": 5.139, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 3.9659693241119385, | |
| "learning_rate": 0.0001979450811897134, | |
| "loss": 5.1726, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 5.214992046356201, | |
| "learning_rate": 0.00019790793839700226, | |
| "loss": 5.2864, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 4.5359601974487305, | |
| "learning_rate": 0.00019787046647268524, | |
| "loss": 5.1443, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 4.26462984085083, | |
| "learning_rate": 0.00019783266554272962, | |
| "loss": 5.0597, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 5.053945064544678, | |
| "learning_rate": 0.00019779453573420873, | |
| "loss": 5.2946, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 6.082211494445801, | |
| "learning_rate": 0.00019775607717530127, | |
| "loss": 5.2075, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 4.107390403747559, | |
| "learning_rate": 0.00019771728999529132, | |
| "loss": 5.1394, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 4.58411169052124, | |
| "learning_rate": 0.00019767817432456752, | |
| "loss": 5.1064, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 8.38965892791748, | |
| "learning_rate": 0.00019763952239228627, | |
| "loss": 5.0808, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 3.885803699493408, | |
| "learning_rate": 0.00019759975669894338, | |
| "loss": 5.0664, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 4.1605916023254395, | |
| "learning_rate": 0.00019755966290999167, | |
| "loss": 5.2469, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 4.821887016296387, | |
| "learning_rate": 0.00019751924116021225, | |
| "loss": 5.2451, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 3.865694761276245, | |
| "learning_rate": 0.00019747849158548858, | |
| "loss": 5.2334, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 3.640681028366089, | |
| "learning_rate": 0.00019743741432280625, | |
| "loss": 5.1206, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 4.04166316986084, | |
| "learning_rate": 0.00019739600951025236, | |
| "loss": 5.0059, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 4.637605667114258, | |
| "learning_rate": 0.00019735427728701516, | |
| "loss": 5.0302, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 4.08723783493042, | |
| "learning_rate": 0.0001973122177933835, | |
| "loss": 5.1551, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 3.7944953441619873, | |
| "learning_rate": 0.00019726983117074643, | |
| "loss": 5.0665, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 5.2847371101379395, | |
| "learning_rate": 0.00019722711756159266, | |
| "loss": 5.2212, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 4.109150409698486, | |
| "learning_rate": 0.00019718407710951012, | |
| "loss": 5.2645, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 4.127768039703369, | |
| "learning_rate": 0.0001971407099591855, | |
| "loss": 5.0395, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 5.058667182922363, | |
| "learning_rate": 0.00019709701625640367, | |
| "loss": 5.0247, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 5.4407267570495605, | |
| "learning_rate": 0.00019705299614804732, | |
| "loss": 4.9935, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 3.7877707481384277, | |
| "learning_rate": 0.00019700864978209636, | |
| "loss": 5.074, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 3.777330160140991, | |
| "learning_rate": 0.00019696397730762746, | |
| "loss": 5.0458, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 4.143067836761475, | |
| "learning_rate": 0.0001969189788748136, | |
| "loss": 4.9375, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 5.560107231140137, | |
| "learning_rate": 0.00019687365463492344, | |
| "loss": 4.8285, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 4.057905197143555, | |
| "learning_rate": 0.00019682800474032095, | |
| "loss": 4.9753, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 3.835442066192627, | |
| "learning_rate": 0.00019678202934446482, | |
| "loss": 4.9368, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 5.135551929473877, | |
| "learning_rate": 0.0001967357286019079, | |
| "loss": 4.9994, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 4.615053653717041, | |
| "learning_rate": 0.00019668910266829685, | |
| "loss": 5.0182, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 4.474258899688721, | |
| "learning_rate": 0.0001966421517003714, | |
| "loss": 4.8704, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 4.264945030212402, | |
| "learning_rate": 0.00019659487585596406, | |
| "loss": 4.9076, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 4.091209411621094, | |
| "learning_rate": 0.00019654727529399925, | |
| "loss": 4.7135, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 4.154038429260254, | |
| "learning_rate": 0.00019649935017449318, | |
| "loss": 4.8239, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 3.697162628173828, | |
| "learning_rate": 0.00019645110065855305, | |
| "loss": 4.9972, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 4.0024847984313965, | |
| "learning_rate": 0.00019640252690837645, | |
| "loss": 4.8854, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 3.9416885375976562, | |
| "learning_rate": 0.0001963536290872511, | |
| "loss": 4.8547, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 3.978651285171509, | |
| "learning_rate": 0.000196304407359554, | |
| "loss": 4.7873, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 4.435175895690918, | |
| "learning_rate": 0.0001962548618907511, | |
| "loss": 4.8124, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 3.8776824474334717, | |
| "learning_rate": 0.00019620499284739662, | |
| "loss": 4.8896, | |
| "step": 12050 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 5.041496276855469, | |
| "learning_rate": 0.00019615480039713248, | |
| "loss": 4.8343, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 4.18281888961792, | |
| "learning_rate": 0.00019610428470868784, | |
| "loss": 4.8559, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 4.223630905151367, | |
| "learning_rate": 0.00019605344595187844, | |
| "loss": 4.8153, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 4.63677453994751, | |
| "learning_rate": 0.0001960022842976061, | |
| "loss": 4.7951, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 4.188296794891357, | |
| "learning_rate": 0.00019595079991785802, | |
| "loss": 4.8904, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 4.402559280395508, | |
| "learning_rate": 0.00019589899298570634, | |
| "loss": 4.7851, | |
| "step": 12350 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 5.976877212524414, | |
| "learning_rate": 0.00019584686367530755, | |
| "loss": 4.6431, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 4.849298477172852, | |
| "learning_rate": 0.0001957944121619018, | |
| "loss": 4.7544, | |
| "step": 12450 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 4.932714462280273, | |
| "learning_rate": 0.0001957416386218124, | |
| "loss": 4.6811, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 4.682474136352539, | |
| "learning_rate": 0.00019568854323244515, | |
| "loss": 4.799, | |
| "step": 12550 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 5.228520393371582, | |
| "learning_rate": 0.00019563619766470511, | |
| "loss": 4.7622, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 4.093870162963867, | |
| "learning_rate": 0.00019558246554138458, | |
| "loss": 4.7369, | |
| "step": 12650 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 5.248356342315674, | |
| "learning_rate": 0.0001955284121038694, | |
| "loss": 4.7519, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 3.924299955368042, | |
| "learning_rate": 0.00019547403753386803, | |
| "loss": 4.6441, | |
| "step": 12750 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 4.972569942474365, | |
| "learning_rate": 0.00019542043906868188, | |
| "loss": 4.7192, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 5.033604145050049, | |
| "learning_rate": 0.00019536542919665846, | |
| "loss": 4.6397, | |
| "step": 12850 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 5.222695350646973, | |
| "learning_rate": 0.00019531009874003928, | |
| "loss": 4.6309, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 3.810999631881714, | |
| "learning_rate": 0.00019525444788482562, | |
| "loss": 4.6513, | |
| "step": 12950 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 5.272600173950195, | |
| "learning_rate": 0.00019519847681809585, | |
| "loss": 4.8001, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 4.836308002471924, | |
| "learning_rate": 0.00019514218572800468, | |
| "loss": 4.7101, | |
| "step": 13050 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 4.598148345947266, | |
| "learning_rate": 0.00019508557480378276, | |
| "loss": 4.5578, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 3.910820722579956, | |
| "learning_rate": 0.0001950286442357358, | |
| "loss": 4.7124, | |
| "step": 13150 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 3.856081962585449, | |
| "learning_rate": 0.00019497139421524416, | |
| "loss": 4.7563, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 4.151907920837402, | |
| "learning_rate": 0.00019491382493476195, | |
| "loss": 4.6726, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 4.349935054779053, | |
| "learning_rate": 0.0001948559365878166, | |
| "loss": 4.6341, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 3.8229756355285645, | |
| "learning_rate": 0.00019479772936900811, | |
| "loss": 4.6183, | |
| "step": 13350 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 5.495506286621094, | |
| "learning_rate": 0.0001947392034740084, | |
| "loss": 4.6608, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 4.307513236999512, | |
| "learning_rate": 0.00019468035909956072, | |
| "loss": 4.6805, | |
| "step": 13450 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 3.939659595489502, | |
| "learning_rate": 0.0001946211964434788, | |
| "loss": 4.679, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 5.444967269897461, | |
| "learning_rate": 0.00019456171570464653, | |
| "loss": 4.7195, | |
| "step": 13550 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 4.513270854949951, | |
| "learning_rate": 0.00019450191708301687, | |
| "loss": 4.5367, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 4.617405414581299, | |
| "learning_rate": 0.00019444180077961146, | |
| "loss": 4.5742, | |
| "step": 13650 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 4.580646991729736, | |
| "learning_rate": 0.00019438136699652001, | |
| "loss": 4.4936, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 4.657532691955566, | |
| "learning_rate": 0.00019432061593689927, | |
| "loss": 4.6877, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 5.374803066253662, | |
| "learning_rate": 0.0001942595478049727, | |
| "loss": 4.6101, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 5.1111650466918945, | |
| "learning_rate": 0.00019419816280602962, | |
| "loss": 4.6185, | |
| "step": 13850 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 5.18306303024292, | |
| "learning_rate": 0.00019413646114642446, | |
| "loss": 4.5524, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 4.411191463470459, | |
| "learning_rate": 0.00019407444303357624, | |
| "loss": 4.4346, | |
| "step": 13950 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 4.161925792694092, | |
| "learning_rate": 0.0001940121086759678, | |
| "loss": 4.3702, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 5.059813022613525, | |
| "learning_rate": 0.000193949458283145, | |
| "loss": 4.5351, | |
| "step": 14050 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 5.563150882720947, | |
| "learning_rate": 0.00019388649206571616, | |
| "loss": 4.477, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 5.1144609451293945, | |
| "learning_rate": 0.00019382321023535127, | |
| "loss": 4.6033, | |
| "step": 14150 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 4.734794616699219, | |
| "learning_rate": 0.00019375961300478127, | |
| "loss": 4.5287, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 4.543684959411621, | |
| "learning_rate": 0.00019369570058779743, | |
| "loss": 4.4474, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 5.4647979736328125, | |
| "learning_rate": 0.00019363147319925047, | |
| "loss": 4.3806, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 5.058681964874268, | |
| "learning_rate": 0.00019356693105505006, | |
| "loss": 4.4998, | |
| "step": 14350 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 5.494804859161377, | |
| "learning_rate": 0.00019350207437216386, | |
| "loss": 4.3911, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 5.227470397949219, | |
| "learning_rate": 0.00019343690336861687, | |
| "loss": 4.2557, | |
| "step": 14450 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 3.7686829566955566, | |
| "learning_rate": 0.00019337141826349092, | |
| "loss": 4.313, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 4.975152492523193, | |
| "learning_rate": 0.00019330561927692345, | |
| "loss": 4.2914, | |
| "step": 14550 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 5.811885356903076, | |
| "learning_rate": 0.00019323950663010733, | |
| "loss": 4.3566, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 5.566829204559326, | |
| "learning_rate": 0.00019317308054528966, | |
| "loss": 4.2847, | |
| "step": 14650 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 5.977478504180908, | |
| "learning_rate": 0.0001931063412457713, | |
| "loss": 4.3034, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 4.601086616516113, | |
| "learning_rate": 0.00019303928895590596, | |
| "loss": 4.1929, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 5.051478385925293, | |
| "learning_rate": 0.0001929719239010996, | |
| "loss": 4.2749, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 6.248847961425781, | |
| "learning_rate": 0.00019290424630780947, | |
| "loss": 4.3419, | |
| "step": 14850 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 5.392062664031982, | |
| "learning_rate": 0.0001928362564035436, | |
| "loss": 4.4038, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 5.6346211433410645, | |
| "learning_rate": 0.00019276795441685975, | |
| "loss": 4.3403, | |
| "step": 14950 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 5.646982192993164, | |
| "learning_rate": 0.00019269934057736493, | |
| "loss": 4.252, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 5.455059051513672, | |
| "learning_rate": 0.00019263041511571438, | |
| "loss": 4.3809, | |
| "step": 15050 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 5.478726387023926, | |
| "learning_rate": 0.00019256117826361096, | |
| "loss": 4.1885, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 5.029292106628418, | |
| "learning_rate": 0.0001924916302538043, | |
| "loss": 4.2615, | |
| "step": 15150 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 5.6447978019714355, | |
| "learning_rate": 0.00019242177132009, | |
| "loss": 4.268, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 5.165138244628906, | |
| "learning_rate": 0.00019235160169730895, | |
| "loss": 4.3222, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 5.661884784698486, | |
| "learning_rate": 0.00019228112162134641, | |
| "loss": 4.3179, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 6.117990493774414, | |
| "learning_rate": 0.0001922103313291313, | |
| "loss": 4.2241, | |
| "step": 15350 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 4.299765110015869, | |
| "learning_rate": 0.0001921392310586353, | |
| "loss": 4.2602, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 5.798460483551025, | |
| "learning_rate": 0.00019206782104887223, | |
| "loss": 4.3096, | |
| "step": 15450 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 5.016506671905518, | |
| "learning_rate": 0.00019199610153989712, | |
| "loss": 4.2073, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 9.708767890930176, | |
| "learning_rate": 0.0001919240727728054, | |
| "loss": 4.2099, | |
| "step": 15550 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 4.904361248016357, | |
| "learning_rate": 0.00019185173498973204, | |
| "loss": 4.2461, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 5.290199279785156, | |
| "learning_rate": 0.00019177908843385103, | |
| "loss": 4.115, | |
| "step": 15650 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 6.290179252624512, | |
| "learning_rate": 0.00019170613334937406, | |
| "loss": 4.3295, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 5.071104526519775, | |
| "learning_rate": 0.00019163286998155027, | |
| "loss": 4.1532, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 4.5464067459106445, | |
| "learning_rate": 0.00019155929857666494, | |
| "loss": 4.0761, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 4.664229393005371, | |
| "learning_rate": 0.0001914854193820389, | |
| "loss": 4.1371, | |
| "step": 15850 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 7.168484210968018, | |
| "learning_rate": 0.0001914112326460277, | |
| "loss": 4.178, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 6.570041179656982, | |
| "learning_rate": 0.0001913367386180207, | |
| "loss": 4.1536, | |
| "step": 15950 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 5.298222064971924, | |
| "learning_rate": 0.00019126193754844036, | |
| "loss": 4.2089, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 7.139255523681641, | |
| "learning_rate": 0.0001911868296887411, | |
| "loss": 4.1362, | |
| "step": 16050 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 5.763050556182861, | |
| "learning_rate": 0.00019111141529140887, | |
| "loss": 4.1106, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 6.586143493652344, | |
| "learning_rate": 0.00019103569460995998, | |
| "loss": 3.9519, | |
| "step": 16150 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 5.827348232269287, | |
| "learning_rate": 0.00019095966789894038, | |
| "loss": 3.9598, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 5.121611595153809, | |
| "learning_rate": 0.00019088333541392478, | |
| "loss": 4.1347, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 5.110377788543701, | |
| "learning_rate": 0.00019080669741151581, | |
| "loss": 4.0088, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 6.672893047332764, | |
| "learning_rate": 0.00019072975414934318, | |
| "loss": 4.0916, | |
| "step": 16350 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 5.667397499084473, | |
| "learning_rate": 0.00019065250588606262, | |
| "loss": 4.0695, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 6.404243469238281, | |
| "learning_rate": 0.0001905749528813553, | |
| "loss": 3.9728, | |
| "step": 16450 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 6.912601470947266, | |
| "learning_rate": 0.00019049709539592686, | |
| "loss": 4.029, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 5.015479564666748, | |
| "learning_rate": 0.00019041893369150636, | |
| "loss": 4.0268, | |
| "step": 16550 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 6.656422138214111, | |
| "learning_rate": 0.00019034046803084563, | |
| "loss": 4.0393, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 4.685242176055908, | |
| "learning_rate": 0.00019026169867771825, | |
| "loss": 4.1104, | |
| "step": 16650 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 6.503780364990234, | |
| "learning_rate": 0.00019018262589691874, | |
| "loss": 4.0344, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 4.73757266998291, | |
| "learning_rate": 0.00019010324995426156, | |
| "loss": 4.1114, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 7.276214122772217, | |
| "learning_rate": 0.0001900235711165804, | |
| "loss": 3.8838, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 6.2224273681640625, | |
| "learning_rate": 0.00018994358965172717, | |
| "loss": 3.9479, | |
| "step": 16850 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 6.4751996994018555, | |
| "learning_rate": 0.00018986330582857096, | |
| "loss": 4.0079, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 4.874088764190674, | |
| "learning_rate": 0.00018978271991699743, | |
| "loss": 4.1664, | |
| "step": 16950 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 7.713326454162598, | |
| "learning_rate": 0.0001897018321879077, | |
| "loss": 3.9646, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 5.753252029418945, | |
| "learning_rate": 0.00018962064291321747, | |
| "loss": 3.8574, | |
| "step": 17050 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 5.962434768676758, | |
| "learning_rate": 0.0001895391523658562, | |
| "loss": 3.9757, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 5.875513553619385, | |
| "learning_rate": 0.00018945736081976607, | |
| "loss": 4.0424, | |
| "step": 17150 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 6.298293590545654, | |
| "learning_rate": 0.00018937526854990108, | |
| "loss": 3.958, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 4.98872184753418, | |
| "learning_rate": 0.00018929287583222625, | |
| "loss": 3.9225, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 6.467836380004883, | |
| "learning_rate": 0.00018921018294371645, | |
| "loss": 3.9369, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 5.920988082885742, | |
| "learning_rate": 0.0001891271901623558, | |
| "loss": 3.975, | |
| "step": 17350 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 5.652931213378906, | |
| "learning_rate": 0.00018904389776713641, | |
| "loss": 3.9067, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 5.372093200683594, | |
| "learning_rate": 0.00018896030603805767, | |
| "loss": 3.9267, | |
| "step": 17450 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 5.743618965148926, | |
| "learning_rate": 0.00018887641525612518, | |
| "loss": 3.8912, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 8.207468032836914, | |
| "learning_rate": 0.00018879222570334985, | |
| "loss": 3.9101, | |
| "step": 17550 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 6.930370807647705, | |
| "learning_rate": 0.00018870773766274697, | |
| "loss": 3.8817, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 6.367077350616455, | |
| "learning_rate": 0.00018862295141833523, | |
| "loss": 3.8931, | |
| "step": 17650 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 6.587210178375244, | |
| "learning_rate": 0.00018853786725513575, | |
| "loss": 3.9393, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 5.502545356750488, | |
| "learning_rate": 0.0001884524854591712, | |
| "loss": 3.8489, | |
| "step": 17750 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 6.352043628692627, | |
| "learning_rate": 0.00018836680631746476, | |
| "loss": 3.8162, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 5.686196804046631, | |
| "learning_rate": 0.00018828083011803917, | |
| "loss": 3.9476, | |
| "step": 17850 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 6.225170612335205, | |
| "learning_rate": 0.00018819455714991578, | |
| "loss": 3.9404, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 7.1347150802612305, | |
| "learning_rate": 0.0001881079877031136, | |
| "loss": 3.9798, | |
| "step": 17950 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 5.343573093414307, | |
| "learning_rate": 0.0001880211220686482, | |
| "loss": 3.9038, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 6.858921051025391, | |
| "learning_rate": 0.00018793396053853098, | |
| "loss": 3.8792, | |
| "step": 18050 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 6.721033573150635, | |
| "learning_rate": 0.0001878482554434291, | |
| "loss": 3.8421, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 6.173632621765137, | |
| "learning_rate": 0.00018776050890530516, | |
| "loss": 4.0233, | |
| "step": 18150 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 5.996013164520264, | |
| "learning_rate": 0.00018767246734761796, | |
| "loss": 3.8057, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 5.707641124725342, | |
| "learning_rate": 0.00018758413106633186, | |
| "loss": 3.8299, | |
| "step": 18250 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 7.221241474151611, | |
| "learning_rate": 0.00018749550035840193, | |
| "loss": 3.8828, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 5.554357528686523, | |
| "learning_rate": 0.00018740657552177305, | |
| "loss": 3.8553, | |
| "step": 18350 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 4.664674282073975, | |
| "learning_rate": 0.00018731735685537885, | |
| "loss": 3.8838, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 5.485450267791748, | |
| "learning_rate": 0.00018722784465914071, | |
| "loss": 3.8165, | |
| "step": 18450 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.825826644897461, | |
| "learning_rate": 0.00018713803923396668, | |
| "loss": 3.7588, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.392491817474365, | |
| "learning_rate": 0.0001870479408817507, | |
| "loss": 3.8001, | |
| "step": 18550 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.493740081787109, | |
| "learning_rate": 0.00018695754990537123, | |
| "loss": 3.9735, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.905117511749268, | |
| "learning_rate": 0.00018686686660869062, | |
| "loss": 3.7334, | |
| "step": 18650 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.598316192626953, | |
| "learning_rate": 0.0001867758912965537, | |
| "loss": 3.8269, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.979629039764404, | |
| "learning_rate": 0.00018668462427478714, | |
| "loss": 3.8713, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 6.480854511260986, | |
| "learning_rate": 0.00018659306585019813, | |
| "loss": 3.7792, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.820549488067627, | |
| "learning_rate": 0.00018650121633057346, | |
| "loss": 3.6656, | |
| "step": 18850 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 6.579679012298584, | |
| "learning_rate": 0.0001864090760246785, | |
| "loss": 3.9109, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.669819355010986, | |
| "learning_rate": 0.00018631664524225615, | |
| "loss": 3.7815, | |
| "step": 18950 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 6.644351005554199, | |
| "learning_rate": 0.0001862239242940257, | |
| "loss": 3.7529, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.022332191467285, | |
| "learning_rate": 0.00018613091349168205, | |
| "loss": 3.7001, | |
| "step": 19050 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 6.376641273498535, | |
| "learning_rate": 0.00018603761314789425, | |
| "loss": 3.6871, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 6.298123359680176, | |
| "learning_rate": 0.00018594402357630495, | |
| "loss": 3.8095, | |
| "step": 19150 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 4.590997695922852, | |
| "learning_rate": 0.00018585014509152882, | |
| "loss": 3.8069, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 6.710943222045898, | |
| "learning_rate": 0.00018575597800915198, | |
| "loss": 3.8547, | |
| "step": 19250 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.5094499588012695, | |
| "learning_rate": 0.0001856615226457305, | |
| "loss": 3.7314, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.584799766540527, | |
| "learning_rate": 0.0001855667793187898, | |
| "loss": 3.7514, | |
| "step": 19350 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 6.0391154289245605, | |
| "learning_rate": 0.00018547174834682308, | |
| "loss": 3.6231, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 5.92927885055542, | |
| "learning_rate": 0.00018537643004929067, | |
| "loss": 3.7008, | |
| "step": 19450 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 6.359600067138672, | |
| "learning_rate": 0.00018528082474661867, | |
| "loss": 3.798, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 5.198579788208008, | |
| "learning_rate": 0.0001851849327601981, | |
| "loss": 3.7187, | |
| "step": 19550 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 6.796758651733398, | |
| "learning_rate": 0.00018508875441238364, | |
| "loss": 3.7086, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 4.889728546142578, | |
| "learning_rate": 0.00018499229002649258, | |
| "loss": 3.7387, | |
| "step": 19650 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 6.382203102111816, | |
| "learning_rate": 0.0001848955399268039, | |
| "loss": 3.5992, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 6.061376571655273, | |
| "learning_rate": 0.00018479850443855686, | |
| "loss": 3.6865, | |
| "step": 19750 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 5.2180681228637695, | |
| "learning_rate": 0.0001847011838879503, | |
| "loss": 3.7467, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 5.063679218292236, | |
| "learning_rate": 0.0001846035786021412, | |
| "loss": 3.6894, | |
| "step": 19850 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 5.036098480224609, | |
| "learning_rate": 0.00018450568890924373, | |
| "loss": 3.6412, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 4.86781644821167, | |
| "learning_rate": 0.00018440751513832822, | |
| "loss": 3.637, | |
| "step": 19950 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 5.41668176651001, | |
| "learning_rate": 0.00018430905761941983, | |
| "loss": 3.6814, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 6.117024898529053, | |
| "learning_rate": 0.00018421031668349773, | |
| "loss": 3.6257, | |
| "step": 20050 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 7.368699073791504, | |
| "learning_rate": 0.00018411129266249373, | |
| "loss": 3.7111, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 4.378394603729248, | |
| "learning_rate": 0.0001840119858892913, | |
| "loss": 3.7197, | |
| "step": 20150 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 7.029990196228027, | |
| "learning_rate": 0.0001839123966977245, | |
| "loss": 3.7267, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 9.922813415527344, | |
| "learning_rate": 0.00018381252542257662, | |
| "loss": 3.7203, | |
| "step": 20250 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 4.9374518394470215, | |
| "learning_rate": 0.00018371237239957932, | |
| "loss": 3.6876, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 5.682550430297852, | |
| "learning_rate": 0.00018361193796541142, | |
| "loss": 3.6862, | |
| "step": 20350 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 5.477772235870361, | |
| "learning_rate": 0.00018351122245769771, | |
| "loss": 3.5982, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 7.745680332183838, | |
| "learning_rate": 0.00018341224888886997, | |
| "loss": 3.6978, | |
| "step": 20450 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 6.691402912139893, | |
| "learning_rate": 0.0001833109778552932, | |
| "loss": 3.6693, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 6.229629993438721, | |
| "learning_rate": 0.00018320942675989125, | |
| "loss": 3.6327, | |
| "step": 20550 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 5.655289649963379, | |
| "learning_rate": 0.0001831075959440427, | |
| "loss": 3.6032, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 5.4868927001953125, | |
| "learning_rate": 0.00018300548575006658, | |
| "loss": 3.7059, | |
| "step": 20650 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 7.387706756591797, | |
| "learning_rate": 0.00018290309652122083, | |
| "loss": 3.6838, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 4.884798049926758, | |
| "learning_rate": 0.00018280042860170168, | |
| "loss": 3.665, | |
| "step": 20750 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 7.185595512390137, | |
| "learning_rate": 0.00018269748233664204, | |
| "loss": 3.6057, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 6.449123382568359, | |
| "learning_rate": 0.0001825942580721106, | |
| "loss": 3.6262, | |
| "step": 20850 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 5.469310283660889, | |
| "learning_rate": 0.00018249075615511053, | |
| "loss": 3.522, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 5.678877353668213, | |
| "learning_rate": 0.0001823869769335784, | |
| "loss": 3.6757, | |
| "step": 20950 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 5.033955097198486, | |
| "learning_rate": 0.000182282920756383, | |
| "loss": 3.7316, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 6.790628433227539, | |
| "learning_rate": 0.00018217858797332413, | |
| "loss": 3.545, | |
| "step": 21050 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 6.205599308013916, | |
| "learning_rate": 0.00018207397893513143, | |
| "loss": 3.6035, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 5.7604804039001465, | |
| "learning_rate": 0.00018196909399346316, | |
| "loss": 3.6869, | |
| "step": 21150 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 5.528883457183838, | |
| "learning_rate": 0.0001818639335009052, | |
| "loss": 3.6493, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 6.46929407119751, | |
| "learning_rate": 0.00018175849781096966, | |
| "loss": 3.639, | |
| "step": 21250 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 5.487035274505615, | |
| "learning_rate": 0.00018165278727809368, | |
| "loss": 3.5755, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 6.251669406890869, | |
| "learning_rate": 0.00018154680225763848, | |
| "loss": 3.704, | |
| "step": 21350 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 6.204404830932617, | |
| "learning_rate": 0.00018144054310588792, | |
| "loss": 3.6071, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 5.7311482429504395, | |
| "learning_rate": 0.00018133401018004743, | |
| "loss": 3.5395, | |
| "step": 21450 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 7.110382556915283, | |
| "learning_rate": 0.00018122720383824273, | |
| "loss": 3.6643, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 5.991401672363281, | |
| "learning_rate": 0.0001811201244395187, | |
| "loss": 3.6752, | |
| "step": 21550 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 5.788415431976318, | |
| "learning_rate": 0.0001810127723438381, | |
| "loss": 3.6362, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 4.827778339385986, | |
| "learning_rate": 0.00018090514791208043, | |
| "loss": 3.7298, | |
| "step": 21650 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 5.7845916748046875, | |
| "learning_rate": 0.0001807972515060407, | |
| "loss": 3.543, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 5.071081638336182, | |
| "learning_rate": 0.00018068908348842818, | |
| "loss": 3.5706, | |
| "step": 21750 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 5.189342021942139, | |
| "learning_rate": 0.00018058064422286525, | |
| "loss": 3.667, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 7.787344455718994, | |
| "learning_rate": 0.00018047193407388603, | |
| "loss": 3.4985, | |
| "step": 21850 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 4.411252975463867, | |
| "learning_rate": 0.00018036295340693531, | |
| "loss": 3.6719, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 5.700460433959961, | |
| "learning_rate": 0.00018025370258836732, | |
| "loss": 3.5075, | |
| "step": 21950 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 5.121459007263184, | |
| "learning_rate": 0.00018014418198544432, | |
| "loss": 3.5511, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 5.29133415222168, | |
| "learning_rate": 0.0001800343919663356, | |
| "loss": 3.7063, | |
| "step": 22050 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 6.53157377243042, | |
| "learning_rate": 0.00017992433290011604, | |
| "loss": 3.5146, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 6.442373275756836, | |
| "learning_rate": 0.00017981400515676508, | |
| "loss": 3.5431, | |
| "step": 22150 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 5.247061729431152, | |
| "learning_rate": 0.00017970340910716522, | |
| "loss": 3.604, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 6.570899486541748, | |
| "learning_rate": 0.000179592545123101, | |
| "loss": 3.6034, | |
| "step": 22250 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 6.012238502502441, | |
| "learning_rate": 0.00017948141357725764, | |
| "loss": 3.4793, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 5.5325422286987305, | |
| "learning_rate": 0.0001793700148432198, | |
| "loss": 3.563, | |
| "step": 22350 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 4.889975070953369, | |
| "learning_rate": 0.00017925834929547035, | |
| "loss": 3.5512, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 5.225555896759033, | |
| "learning_rate": 0.00017914641730938907, | |
| "loss": 3.5521, | |
| "step": 22450 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 5.430109024047852, | |
| "learning_rate": 0.0001790342192612514, | |
| "loss": 3.4549, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 5.8808274269104, | |
| "learning_rate": 0.00017892175552822716, | |
| "loss": 3.5518, | |
| "step": 22550 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 6.657894134521484, | |
| "learning_rate": 0.00017880902648837946, | |
| "loss": 3.4643, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 4.968985557556152, | |
| "learning_rate": 0.00017869603252066308, | |
| "loss": 3.5022, | |
| "step": 22650 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 5.369678497314453, | |
| "learning_rate": 0.00017858277400492357, | |
| "loss": 3.6906, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 6.433826446533203, | |
| "learning_rate": 0.0001784692513218956, | |
| "loss": 3.4281, | |
| "step": 22750 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 5.364591121673584, | |
| "learning_rate": 0.00017835546485320202, | |
| "loss": 3.6194, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 5.889247894287109, | |
| "learning_rate": 0.00017824141498135244, | |
| "loss": 3.7013, | |
| "step": 22850 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 5.254469394683838, | |
| "learning_rate": 0.0001781271020897419, | |
| "loss": 3.4107, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 5.486823558807373, | |
| "learning_rate": 0.0001780125265626495, | |
| "loss": 3.5453, | |
| "step": 22950 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 5.4713311195373535, | |
| "learning_rate": 0.0001778976887852375, | |
| "loss": 3.5482, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 6.2519731521606445, | |
| "learning_rate": 0.00017778258914354946, | |
| "loss": 3.6251, | |
| "step": 23050 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 5.657818794250488, | |
| "learning_rate": 0.00017766722802450944, | |
| "loss": 3.5081, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 6.176442623138428, | |
| "learning_rate": 0.0001775516058159204, | |
| "loss": 3.45, | |
| "step": 23150 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 5.838647365570068, | |
| "learning_rate": 0.00017743572290646303, | |
| "loss": 3.419, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 6.912227630615234, | |
| "learning_rate": 0.00017731957968569436, | |
| "loss": 3.4892, | |
| "step": 23250 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 7.276485919952393, | |
| "learning_rate": 0.0001772031765440465, | |
| "loss": 3.5143, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 5.351586818695068, | |
| "learning_rate": 0.0001770865138728254, | |
| "loss": 3.5467, | |
| "step": 23350 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 5.220416069030762, | |
| "learning_rate": 0.00017696959206420937, | |
| "loss": 3.4736, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 5.282609462738037, | |
| "learning_rate": 0.00017685241151124781, | |
| "loss": 3.4181, | |
| "step": 23450 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 4.958062171936035, | |
| "learning_rate": 0.00017673497260786006, | |
| "loss": 3.4309, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 6.3785929679870605, | |
| "learning_rate": 0.00017661727574883388, | |
| "loss": 3.3805, | |
| "step": 23550 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 5.702798366546631, | |
| "learning_rate": 0.00017649932132982415, | |
| "loss": 3.5371, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 6.496365547180176, | |
| "learning_rate": 0.0001763811097473516, | |
| "loss": 3.4107, | |
| "step": 23650 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 5.093421936035156, | |
| "learning_rate": 0.00017626264139880148, | |
| "loss": 3.5514, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 5.6509480476379395, | |
| "learning_rate": 0.0001761439166824221, | |
| "loss": 3.5612, | |
| "step": 23750 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 5.662957191467285, | |
| "learning_rate": 0.00017602493599732372, | |
| "loss": 3.5515, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 7.548245429992676, | |
| "learning_rate": 0.000175905699743477, | |
| "loss": 3.5552, | |
| "step": 23850 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 5.2797112464904785, | |
| "learning_rate": 0.00017578620832171173, | |
| "loss": 3.4159, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 6.431013584136963, | |
| "learning_rate": 0.0001756664621337155, | |
| "loss": 3.4257, | |
| "step": 23950 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 8.88436222076416, | |
| "learning_rate": 0.00017554646158203236, | |
| "loss": 3.5517, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 5.538012981414795, | |
| "learning_rate": 0.00017542620707006136, | |
| "loss": 3.4451, | |
| "step": 24050 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 6.702478408813477, | |
| "learning_rate": 0.00017530569900205538, | |
| "loss": 3.5453, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 5.236027240753174, | |
| "learning_rate": 0.00017518493778311957, | |
| "loss": 3.4483, | |
| "step": 24150 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 5.824537754058838, | |
| "learning_rate": 0.00017506392381921014, | |
| "loss": 3.507, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 6.682642459869385, | |
| "learning_rate": 0.0001749426575171329, | |
| "loss": 3.4624, | |
| "step": 24250 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 6.02097225189209, | |
| "learning_rate": 0.00017482113928454196, | |
| "loss": 3.4782, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 5.951188087463379, | |
| "learning_rate": 0.00017469936952993834, | |
| "loss": 3.5305, | |
| "step": 24350 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 5.47694730758667, | |
| "learning_rate": 0.00017457734866266854, | |
| "loss": 3.4653, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 5.073057651519775, | |
| "learning_rate": 0.0001744575249785453, | |
| "loss": 3.4969, | |
| "step": 24450 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 6.532285690307617, | |
| "learning_rate": 0.00017433500811915326, | |
| "loss": 3.3932, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 6.016458988189697, | |
| "learning_rate": 0.00017421224137194837, | |
| "loss": 3.4828, | |
| "step": 24550 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 7.032898902893066, | |
| "learning_rate": 0.0001740892251496286, | |
| "loss": 3.4347, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 6.3446431159973145, | |
| "learning_rate": 0.00017396595986573065, | |
| "loss": 3.4101, | |
| "step": 24650 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 5.952356815338135, | |
| "learning_rate": 0.00017384244593462859, | |
| "loss": 3.4296, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 5.403810501098633, | |
| "learning_rate": 0.00017371868377153216, | |
| "loss": 3.4264, | |
| "step": 24750 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 6.429996490478516, | |
| "learning_rate": 0.00017359467379248568, | |
| "loss": 3.4341, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 5.618744850158691, | |
| "learning_rate": 0.00017347041641436653, | |
| "loss": 3.3357, | |
| "step": 24850 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 6.539459705352783, | |
| "learning_rate": 0.00017334840455978504, | |
| "loss": 3.5718, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 4.993662357330322, | |
| "learning_rate": 0.00017322365856462736, | |
| "loss": 3.4774, | |
| "step": 24950 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 5.6996002197265625, | |
| "learning_rate": 0.00017309866641761798, | |
| "loss": 3.311, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 4.32814884185791, | |
| "learning_rate": 0.00017297342853893604, | |
| "loss": 3.5558, | |
| "step": 25050 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 5.538712501525879, | |
| "learning_rate": 0.0001728479453495866, | |
| "loss": 3.3261, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 4.975490093231201, | |
| "learning_rate": 0.00017272221727139946, | |
| "loss": 3.5, | |
| "step": 25150 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 4.377697467803955, | |
| "learning_rate": 0.00017259624472702764, | |
| "loss": 3.4562, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 6.686251163482666, | |
| "learning_rate": 0.00017247002813994592, | |
| "loss": 3.3968, | |
| "step": 25250 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 7.116434097290039, | |
| "learning_rate": 0.00017234356793444954, | |
| "loss": 3.3161, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 5.755252838134766, | |
| "learning_rate": 0.0001722168645356526, | |
| "loss": 3.4195, | |
| "step": 25350 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 6.647252559661865, | |
| "learning_rate": 0.00017208991836948685, | |
| "loss": 3.1887, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 4.997719764709473, | |
| "learning_rate": 0.0001719627298627, | |
| "loss": 3.4098, | |
| "step": 25450 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 6.054971218109131, | |
| "learning_rate": 0.00017183529944285456, | |
| "loss": 3.4159, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 4.706241130828857, | |
| "learning_rate": 0.00017170762753832615, | |
| "loss": 3.4024, | |
| "step": 25550 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 7.701054096221924, | |
| "learning_rate": 0.00017157971457830226, | |
| "loss": 3.3564, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 5.441225528717041, | |
| "learning_rate": 0.00017145156099278067, | |
| "loss": 3.5887, | |
| "step": 25650 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 5.631026268005371, | |
| "learning_rate": 0.0001713231672125681, | |
| "loss": 3.352, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 4.974308967590332, | |
| "learning_rate": 0.0001711945336692786, | |
| "loss": 3.3959, | |
| "step": 25750 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 7.063317775726318, | |
| "learning_rate": 0.00017106566079533246, | |
| "loss": 3.3942, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 6.312389850616455, | |
| "learning_rate": 0.0001709365490239543, | |
| "loss": 3.3928, | |
| "step": 25850 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 5.022332668304443, | |
| "learning_rate": 0.00017080719878917182, | |
| "loss": 3.4401, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 4.356366157531738, | |
| "learning_rate": 0.00017067761052581455, | |
| "loss": 3.4353, | |
| "step": 25950 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 5.611413478851318, | |
| "learning_rate": 0.00017054778466951196, | |
| "loss": 3.2737, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 7.244396686553955, | |
| "learning_rate": 0.0001704177216566924, | |
| "loss": 3.2309, | |
| "step": 26050 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 6.024662017822266, | |
| "learning_rate": 0.00017028742192458132, | |
| "loss": 3.3593, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 7.399158954620361, | |
| "learning_rate": 0.00017015688591120006, | |
| "loss": 3.2026, | |
| "step": 26150 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 7.96980619430542, | |
| "learning_rate": 0.00017002611405536413, | |
| "loss": 3.4413, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 5.538659572601318, | |
| "learning_rate": 0.00016989510679668194, | |
| "loss": 3.3497, | |
| "step": 26250 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 5.8960394859313965, | |
| "learning_rate": 0.00016976386457555323, | |
| "loss": 3.3708, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 5.486491680145264, | |
| "learning_rate": 0.00016963238783316754, | |
| "loss": 3.4697, | |
| "step": 26350 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 5.217641353607178, | |
| "learning_rate": 0.0001695006770115029, | |
| "loss": 3.4249, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 5.6906938552856445, | |
| "learning_rate": 0.00016936873255332413, | |
| "loss": 3.5343, | |
| "step": 26450 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 6.275619983673096, | |
| "learning_rate": 0.00016923655490218149, | |
| "loss": 3.3991, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 5.75913667678833, | |
| "learning_rate": 0.00016910414450240917, | |
| "loss": 3.3861, | |
| "step": 26550 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 5.206583499908447, | |
| "learning_rate": 0.0001689715017991237, | |
| "loss": 3.369, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 5.569302082061768, | |
| "learning_rate": 0.0001688386272382227, | |
| "loss": 3.4837, | |
| "step": 26650 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 5.360637187957764, | |
| "learning_rate": 0.00016870552126638298, | |
| "loss": 3.3299, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 5.168808937072754, | |
| "learning_rate": 0.00016857218433105945, | |
| "loss": 3.3613, | |
| "step": 26750 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 5.515918731689453, | |
| "learning_rate": 0.0001684386168804834, | |
| "loss": 3.159, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 5.885009765625, | |
| "learning_rate": 0.000168304819363661, | |
| "loss": 3.3029, | |
| "step": 26850 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 5.174962997436523, | |
| "learning_rate": 0.0001681707922303718, | |
| "loss": 3.3289, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 4.86044454574585, | |
| "learning_rate": 0.0001680365359311673, | |
| "loss": 3.3132, | |
| "step": 26950 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 6.267008304595947, | |
| "learning_rate": 0.00016790205091736935, | |
| "loss": 3.3649, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 6.221423625946045, | |
| "learning_rate": 0.00016776733764106862, | |
| "loss": 3.3311, | |
| "step": 27050 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 5.069894790649414, | |
| "learning_rate": 0.00016763239655512318, | |
| "loss": 3.3157, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 8.425812721252441, | |
| "learning_rate": 0.00016749722811315688, | |
| "loss": 3.2714, | |
| "step": 27150 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 5.80504846572876, | |
| "learning_rate": 0.00016736183276955783, | |
| "loss": 3.3274, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 7.617208003997803, | |
| "learning_rate": 0.00016722621097947697, | |
| "loss": 3.1857, | |
| "step": 27250 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 5.363246440887451, | |
| "learning_rate": 0.00016709036319882646, | |
| "loss": 3.4673, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 6.790156364440918, | |
| "learning_rate": 0.00016695428988427807, | |
| "loss": 3.3016, | |
| "step": 27350 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 4.9824981689453125, | |
| "learning_rate": 0.00016681799149326185, | |
| "loss": 3.4103, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 4.915642738342285, | |
| "learning_rate": 0.00016668146848396442, | |
| "loss": 3.4356, | |
| "step": 27450 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 4.938210487365723, | |
| "learning_rate": 0.0001665447213153275, | |
| "loss": 3.299, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 5.116371154785156, | |
| "learning_rate": 0.00016640775044704634, | |
| "loss": 3.3231, | |
| "step": 27550 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 6.796716213226318, | |
| "learning_rate": 0.0001662705563395682, | |
| "loss": 3.3685, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 9.196764945983887, | |
| "learning_rate": 0.0001661331394540908, | |
| "loss": 3.2807, | |
| "step": 27650 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 4.64096736907959, | |
| "learning_rate": 0.00016599550025256076, | |
| "loss": 3.2909, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 5.615271091461182, | |
| "learning_rate": 0.000165857639197672, | |
| "loss": 3.2044, | |
| "step": 27750 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 6.112679481506348, | |
| "learning_rate": 0.0001657195567528643, | |
| "loss": 3.2377, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 4.882411003112793, | |
| "learning_rate": 0.0001655812533823216, | |
| "loss": 3.4462, | |
| "step": 27850 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 4.716900825500488, | |
| "learning_rate": 0.00016544272955097063, | |
| "loss": 3.3563, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 5.415688991546631, | |
| "learning_rate": 0.0001653039857244791, | |
| "loss": 3.2475, | |
| "step": 27950 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 5.752101421356201, | |
| "learning_rate": 0.00016516502236925434, | |
| "loss": 3.3646, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 7.952321529388428, | |
| "learning_rate": 0.00016502583995244163, | |
| "loss": 3.2835, | |
| "step": 28050 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 5.233190536499023, | |
| "learning_rate": 0.00016488643894192268, | |
| "loss": 3.3653, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 5.358859062194824, | |
| "learning_rate": 0.00016474681980631402, | |
| "loss": 3.2425, | |
| "step": 28150 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 5.584342002868652, | |
| "learning_rate": 0.0001646069830149654, | |
| "loss": 3.3139, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 6.155908584594727, | |
| "learning_rate": 0.00016446692903795837, | |
| "loss": 3.2732, | |
| "step": 28250 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 6.454020023345947, | |
| "learning_rate": 0.00016432665834610445, | |
| "loss": 3.2121, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 5.337937831878662, | |
| "learning_rate": 0.00016418617141094374, | |
| "loss": 3.3123, | |
| "step": 28350 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 4.727194786071777, | |
| "learning_rate": 0.00016404546870474324, | |
| "loss": 3.2558, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 5.887204647064209, | |
| "learning_rate": 0.00016390455070049536, | |
| "loss": 3.377, | |
| "step": 28450 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 5.2247724533081055, | |
| "learning_rate": 0.0001637634178719162, | |
| "loss": 3.1277, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 5.014094829559326, | |
| "learning_rate": 0.00016362207069344403, | |
| "loss": 3.142, | |
| "step": 28550 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 7.58336067199707, | |
| "learning_rate": 0.00016348050964023773, | |
| "loss": 3.3156, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 4.806646823883057, | |
| "learning_rate": 0.00016333873518817514, | |
| "loss": 3.2814, | |
| "step": 28650 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 5.387387275695801, | |
| "learning_rate": 0.00016319674781385143, | |
| "loss": 3.3977, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 5.4912638664245605, | |
| "learning_rate": 0.00016305454799457755, | |
| "loss": 3.2323, | |
| "step": 28750 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 4.692640781402588, | |
| "learning_rate": 0.00016291213620837867, | |
| "loss": 3.3033, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 5.484092712402344, | |
| "learning_rate": 0.0001627695129339924, | |
| "loss": 3.1466, | |
| "step": 28850 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 5.305532932281494, | |
| "learning_rate": 0.00016262667865086746, | |
| "loss": 3.4111, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 5.511746883392334, | |
| "learning_rate": 0.00016248363383916182, | |
| "loss": 3.2535, | |
| "step": 28950 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 7.3723649978637695, | |
| "learning_rate": 0.00016234037897974108, | |
| "loss": 3.3265, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 5.085361003875732, | |
| "learning_rate": 0.0001621997858933184, | |
| "loss": 3.3003, | |
| "step": 29050 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 4.863938331604004, | |
| "learning_rate": 0.000162056116560834, | |
| "loss": 3.2411, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 5.351004123687744, | |
| "learning_rate": 0.00016191223861779529, | |
| "loss": 3.2409, | |
| "step": 29150 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 8.316920280456543, | |
| "learning_rate": 0.0001617681525478687, | |
| "loss": 3.0881, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 7.093586444854736, | |
| "learning_rate": 0.0001616238588354203, | |
| "loss": 3.2573, | |
| "step": 29250 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 6.5853705406188965, | |
| "learning_rate": 0.00016147935796551405, | |
| "loss": 3.3215, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 4.667483329772949, | |
| "learning_rate": 0.00016133465042391046, | |
| "loss": 3.3032, | |
| "step": 29350 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 5.287956237792969, | |
| "learning_rate": 0.00016118973669706468, | |
| "loss": 3.2255, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 5.676483631134033, | |
| "learning_rate": 0.0001610446172721251, | |
| "loss": 3.3663, | |
| "step": 29450 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 6.739033222198486, | |
| "learning_rate": 0.00016089929263693144, | |
| "loss": 3.2537, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 5.158905982971191, | |
| "learning_rate": 0.00016075376328001344, | |
| "loss": 3.2336, | |
| "step": 29550 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 5.2512526512146, | |
| "learning_rate": 0.00016060802969058885, | |
| "loss": 3.2982, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 8.520125389099121, | |
| "learning_rate": 0.00016046209235856212, | |
| "loss": 3.3153, | |
| "step": 29650 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 6.461225509643555, | |
| "learning_rate": 0.00016031595177452257, | |
| "loss": 3.2629, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 6.117869853973389, | |
| "learning_rate": 0.00016016960842974278, | |
| "loss": 3.2225, | |
| "step": 29750 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 4.035212993621826, | |
| "learning_rate": 0.00016002306281617692, | |
| "loss": 3.3866, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 5.271117687225342, | |
| "learning_rate": 0.00015987631542645913, | |
| "loss": 3.2602, | |
| "step": 29850 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 4.860154151916504, | |
| "learning_rate": 0.00015972936675390185, | |
| "loss": 3.288, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 5.154600143432617, | |
| "learning_rate": 0.0001595822172924942, | |
| "loss": 3.1941, | |
| "step": 29950 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 6.897374153137207, | |
| "learning_rate": 0.00015943486753690017, | |
| "loss": 3.2323, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 6.063130855560303, | |
| "learning_rate": 0.00015928731798245721, | |
| "loss": 3.1718, | |
| "step": 30050 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 5.736262321472168, | |
| "learning_rate": 0.00015913956912517432, | |
| "loss": 3.3035, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 5.386317253112793, | |
| "learning_rate": 0.00015899162146173053, | |
| "loss": 3.2879, | |
| "step": 30150 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 5.535543441772461, | |
| "learning_rate": 0.00015884347548947314, | |
| "loss": 3.2266, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 6.057496070861816, | |
| "learning_rate": 0.00015869513170641616, | |
| "loss": 3.1668, | |
| "step": 30250 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 5.6912055015563965, | |
| "learning_rate": 0.00015854659061123854, | |
| "loss": 3.1562, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 5.563050270080566, | |
| "learning_rate": 0.0001583978527032825, | |
| "loss": 3.1819, | |
| "step": 30350 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 6.75504732131958, | |
| "learning_rate": 0.0001582489184825519, | |
| "loss": 3.1891, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 5.390425205230713, | |
| "learning_rate": 0.00015809978844971053, | |
| "loss": 3.1856, | |
| "step": 30450 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 6.183398246765137, | |
| "learning_rate": 0.0001579504631060804, | |
| "loss": 3.3115, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 5.40380859375, | |
| "learning_rate": 0.00015780094295364015, | |
| "loss": 3.162, | |
| "step": 30550 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 5.078804016113281, | |
| "learning_rate": 0.00015765122849502325, | |
| "loss": 3.2046, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 6.183681964874268, | |
| "learning_rate": 0.00015750132023351638, | |
| "loss": 3.0689, | |
| "step": 30650 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 12.730826377868652, | |
| "learning_rate": 0.00015735121867305768, | |
| "loss": 3.2468, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 6.127053260803223, | |
| "learning_rate": 0.00015720092431823515, | |
| "loss": 3.1628, | |
| "step": 30750 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 5.8310089111328125, | |
| "learning_rate": 0.00015705043767428483, | |
| "loss": 3.2047, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 7.505776882171631, | |
| "learning_rate": 0.0001568997592470892, | |
| "loss": 3.2827, | |
| "step": 30850 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 5.399072170257568, | |
| "learning_rate": 0.00015674888954317549, | |
| "loss": 3.1483, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 5.224669456481934, | |
| "learning_rate": 0.00015659782906971383, | |
| "loss": 3.2698, | |
| "step": 30950 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 7.958742618560791, | |
| "learning_rate": 0.00015644657833451577, | |
| "loss": 3.0145, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 6.20373010635376, | |
| "learning_rate": 0.0001562981685120925, | |
| "loss": 3.1598, | |
| "step": 31050 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 5.2080159187316895, | |
| "learning_rate": 0.00015614654255930347, | |
| "loss": 3.2801, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 4.195250988006592, | |
| "learning_rate": 0.00015599472786184245, | |
| "loss": 3.156, | |
| "step": 31150 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 5.0389204025268555, | |
| "learning_rate": 0.00015584272493005642, | |
| "loss": 3.1345, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 7.003210544586182, | |
| "learning_rate": 0.00015569053427492505, | |
| "loss": 3.2186, | |
| "step": 31250 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 5.47674036026001, | |
| "learning_rate": 0.00015553815640805907, | |
| "loss": 3.3211, | |
| "step": 31300 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 5.2981648445129395, | |
| "learning_rate": 0.00015538559184169863, | |
| "loss": 3.2454, | |
| "step": 31350 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 5.528575420379639, | |
| "learning_rate": 0.00015523284108871142, | |
| "loss": 3.1963, | |
| "step": 31400 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 5.553009033203125, | |
| "learning_rate": 0.0001550799046625911, | |
| "loss": 3.1682, | |
| "step": 31450 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 5.608404636383057, | |
| "learning_rate": 0.0001549267830774553, | |
| "loss": 3.1461, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 6.462625503540039, | |
| "learning_rate": 0.00015477347684804445, | |
| "loss": 3.2772, | |
| "step": 31550 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 5.327962875366211, | |
| "learning_rate": 0.00015461998648971928, | |
| "loss": 3.2144, | |
| "step": 31600 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 6.94124174118042, | |
| "learning_rate": 0.00015446631251845978, | |
| "loss": 3.2227, | |
| "step": 31650 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 5.351782321929932, | |
| "learning_rate": 0.00015431245545086307, | |
| "loss": 3.2687, | |
| "step": 31700 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 4.562844276428223, | |
| "learning_rate": 0.00015415841580414185, | |
| "loss": 3.1332, | |
| "step": 31750 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 5.023700714111328, | |
| "learning_rate": 0.00015400419409612243, | |
| "loss": 3.2272, | |
| "step": 31800 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 5.127398490905762, | |
| "learning_rate": 0.0001538497908452433, | |
| "loss": 3.2843, | |
| "step": 31850 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 5.577905178070068, | |
| "learning_rate": 0.0001536952065705532, | |
| "loss": 3.2635, | |
| "step": 31900 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 6.119299411773682, | |
| "learning_rate": 0.00015354044179170933, | |
| "loss": 3.126, | |
| "step": 31950 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 4.827983856201172, | |
| "learning_rate": 0.0001533854970289758, | |
| "loss": 3.2345, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 4.499656677246094, | |
| "learning_rate": 0.00015323037280322166, | |
| "loss": 3.0808, | |
| "step": 32050 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 5.260239601135254, | |
| "learning_rate": 0.00015307506963591923, | |
| "loss": 3.1234, | |
| "step": 32100 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 5.486075401306152, | |
| "learning_rate": 0.00015291958804914256, | |
| "loss": 3.1769, | |
| "step": 32150 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 4.572110176086426, | |
| "learning_rate": 0.00015276392856556527, | |
| "loss": 3.2166, | |
| "step": 32200 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 5.288125991821289, | |
| "learning_rate": 0.0001526080917084591, | |
| "loss": 3.0781, | |
| "step": 32250 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 6.380829334259033, | |
| "learning_rate": 0.000152452078001692, | |
| "loss": 3.1178, | |
| "step": 32300 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 5.039462566375732, | |
| "learning_rate": 0.00015229588796972652, | |
| "loss": 3.2808, | |
| "step": 32350 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 7.323626518249512, | |
| "learning_rate": 0.00015213952213761787, | |
| "loss": 3.1391, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 5.912395000457764, | |
| "learning_rate": 0.00015198298103101228, | |
| "loss": 3.1744, | |
| "step": 32450 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 5.694441795349121, | |
| "learning_rate": 0.00015182626517614518, | |
| "loss": 3.0576, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 6.467188358306885, | |
| "learning_rate": 0.00015166937509983943, | |
| "loss": 3.2361, | |
| "step": 32550 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 5.292226791381836, | |
| "learning_rate": 0.00015151231132950357, | |
| "loss": 3.1376, | |
| "step": 32600 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 6.369929313659668, | |
| "learning_rate": 0.00015135507439313005, | |
| "loss": 3.1406, | |
| "step": 32650 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 5.252573013305664, | |
| "learning_rate": 0.00015119766481929342, | |
| "loss": 3.123, | |
| "step": 32700 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 6.9053730964660645, | |
| "learning_rate": 0.00015104008313714858, | |
| "loss": 3.0018, | |
| "step": 32750 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 6.080839157104492, | |
| "learning_rate": 0.00015088232987642898, | |
| "loss": 3.2106, | |
| "step": 32800 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 5.92653226852417, | |
| "learning_rate": 0.00015072440556744492, | |
| "loss": 3.2095, | |
| "step": 32850 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 5.627429008483887, | |
| "learning_rate": 0.00015056631074108166, | |
| "loss": 3.089, | |
| "step": 32900 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 5.694194793701172, | |
| "learning_rate": 0.00015040804592879762, | |
| "loss": 3.0885, | |
| "step": 32950 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 5.04107666015625, | |
| "learning_rate": 0.00015024961166262276, | |
| "loss": 3.0906, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 5.280002117156982, | |
| "learning_rate": 0.0001500910084751567, | |
| "loss": 3.2142, | |
| "step": 33050 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 4.803068161010742, | |
| "learning_rate": 0.00014993223689956672, | |
| "loss": 3.2014, | |
| "step": 33100 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 5.611780643463135, | |
| "learning_rate": 0.00014977329746958636, | |
| "loss": 3.2491, | |
| "step": 33150 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 5.6669020652771, | |
| "learning_rate": 0.00014961737449079314, | |
| "loss": 3.2, | |
| "step": 33200 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 5.894138336181641, | |
| "learning_rate": 0.00014945810428594703, | |
| "loss": 3.0321, | |
| "step": 33250 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 6.354518413543701, | |
| "learning_rate": 0.0001492986678205755, | |
| "loss": 3.1314, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 4.786489009857178, | |
| "learning_rate": 0.00014913906563064706, | |
| "loss": 3.1937, | |
| "step": 33350 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 8.759417533874512, | |
| "learning_rate": 0.00014897929825268745, | |
| "loss": 3.1069, | |
| "step": 33400 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 5.354910850524902, | |
| "learning_rate": 0.00014881936622377766, | |
| "loss": 3.1519, | |
| "step": 33450 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 5.169478416442871, | |
| "learning_rate": 0.0001486592700815522, | |
| "loss": 3.1414, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 6.09418249130249, | |
| "learning_rate": 0.00014849901036419723, | |
| "loss": 3.0954, | |
| "step": 33550 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 4.986037731170654, | |
| "learning_rate": 0.00014833858761044883, | |
| "loss": 3.2445, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 12.803654670715332, | |
| "learning_rate": 0.00014817800235959118, | |
| "loss": 3.0699, | |
| "step": 33650 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 6.191990852355957, | |
| "learning_rate": 0.00014801725515145467, | |
| "loss": 3.2574, | |
| "step": 33700 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 5.233778476715088, | |
| "learning_rate": 0.00014785634652641412, | |
| "loss": 3.1152, | |
| "step": 33750 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 6.065474987030029, | |
| "learning_rate": 0.000147695277025387, | |
| "loss": 3.1178, | |
| "step": 33800 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 5.4664435386657715, | |
| "learning_rate": 0.00014753404718983158, | |
| "loss": 3.0627, | |
| "step": 33850 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 4.926270484924316, | |
| "learning_rate": 0.00014737265756174515, | |
| "loss": 3.0182, | |
| "step": 33900 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 6.013931751251221, | |
| "learning_rate": 0.0001472111086836621, | |
| "loss": 3.0801, | |
| "step": 33950 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 6.25607442855835, | |
| "learning_rate": 0.00014704940109865224, | |
| "loss": 3.1227, | |
| "step": 34000 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 90183, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 1000, | |
| "total_flos": 1.2226077917184e+16, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |