| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.510073960724305, | |
| "eval_steps": 500, | |
| "global_step": 46000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 3.2580924034118652, | |
| "learning_rate": 2.2172949002217296e-06, | |
| "loss": 10.2933, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 2.4347386360168457, | |
| "learning_rate": 4.434589800443459e-06, | |
| "loss": 10.1894, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 2.3895885944366455, | |
| "learning_rate": 6.651884700665188e-06, | |
| "loss": 10.1424, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 2.129647731781006, | |
| "learning_rate": 8.869179600886918e-06, | |
| "loss": 10.0995, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 2.3564186096191406, | |
| "learning_rate": 1.1086474501108649e-05, | |
| "loss": 10.0479, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 1.830551028251648, | |
| "learning_rate": 1.3303769401330377e-05, | |
| "loss": 9.9971, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 2.1173911094665527, | |
| "learning_rate": 1.5521064301552106e-05, | |
| "loss": 9.9201, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 1.6636557579040527, | |
| "learning_rate": 1.7738359201773837e-05, | |
| "loss": 9.8562, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 2.4503839015960693, | |
| "learning_rate": 1.9955654101995567e-05, | |
| "loss": 9.7599, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.822424054145813, | |
| "learning_rate": 2.2172949002217298e-05, | |
| "loss": 9.6608, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.6598998308181763, | |
| "learning_rate": 2.4390243902439026e-05, | |
| "loss": 9.55, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.8471707105636597, | |
| "learning_rate": 2.6607538802660753e-05, | |
| "loss": 9.4606, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.4833533763885498, | |
| "learning_rate": 2.8824833702882487e-05, | |
| "loss": 9.3283, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.688541054725647, | |
| "learning_rate": 3.104212860310421e-05, | |
| "loss": 9.2229, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.6466543674468994, | |
| "learning_rate": 3.325942350332594e-05, | |
| "loss": 9.1093, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.4169293642044067, | |
| "learning_rate": 3.547671840354767e-05, | |
| "loss": 8.9703, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.7079193592071533, | |
| "learning_rate": 3.7694013303769404e-05, | |
| "loss": 8.8351, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.5513204336166382, | |
| "learning_rate": 3.9911308203991135e-05, | |
| "loss": 8.7111, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.485573172569275, | |
| "learning_rate": 4.212860310421286e-05, | |
| "loss": 8.5627, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.511690616607666, | |
| "learning_rate": 4.4345898004434597e-05, | |
| "loss": 8.5042, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 2.1478614807128906, | |
| "learning_rate": 4.656319290465632e-05, | |
| "loss": 8.3287, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.4060652256011963, | |
| "learning_rate": 4.878048780487805e-05, | |
| "loss": 8.2341, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.3950035572052002, | |
| "learning_rate": 5.099778270509978e-05, | |
| "loss": 8.1277, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.5197688341140747, | |
| "learning_rate": 5.3215077605321506e-05, | |
| "loss": 8.0311, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.3406693935394287, | |
| "learning_rate": 5.543237250554324e-05, | |
| "loss": 7.9824, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.4520119428634644, | |
| "learning_rate": 5.7649667405764975e-05, | |
| "loss": 7.9948, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.179124116897583, | |
| "learning_rate": 5.98669623059867e-05, | |
| "loss": 7.9144, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.4039533138275146, | |
| "learning_rate": 6.208425720620842e-05, | |
| "loss": 7.8768, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.5542700290679932, | |
| "learning_rate": 6.430155210643016e-05, | |
| "loss": 7.894, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.4150550365447998, | |
| "learning_rate": 6.651884700665188e-05, | |
| "loss": 7.8409, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.6647827625274658, | |
| "learning_rate": 6.873614190687362e-05, | |
| "loss": 7.91, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.7795697450637817, | |
| "learning_rate": 7.095343680709535e-05, | |
| "loss": 7.8256, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.933110237121582, | |
| "learning_rate": 7.317073170731707e-05, | |
| "loss": 7.8463, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.1942570209503174, | |
| "learning_rate": 7.538802660753881e-05, | |
| "loss": 7.7827, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.6759297847747803, | |
| "learning_rate": 7.760532150776053e-05, | |
| "loss": 7.8, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.093256950378418, | |
| "learning_rate": 7.982261640798227e-05, | |
| "loss": 7.7461, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.567872166633606, | |
| "learning_rate": 8.2039911308204e-05, | |
| "loss": 7.7338, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.3017679452896118, | |
| "learning_rate": 8.425720620842572e-05, | |
| "loss": 7.804, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.7510960102081299, | |
| "learning_rate": 8.647450110864746e-05, | |
| "loss": 7.7405, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.7215120792388916, | |
| "learning_rate": 8.869179600886919e-05, | |
| "loss": 7.7429, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.6202715635299683, | |
| "learning_rate": 9.090909090909092e-05, | |
| "loss": 7.6588, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.5680756568908691, | |
| "learning_rate": 9.312638580931264e-05, | |
| "loss": 7.6224, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.462240219116211, | |
| "learning_rate": 9.534368070953438e-05, | |
| "loss": 7.6851, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 2.2018320560455322, | |
| "learning_rate": 9.75609756097561e-05, | |
| "loss": 7.6443, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.9520208835601807, | |
| "learning_rate": 9.977827050997783e-05, | |
| "loss": 7.6456, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.115421175956726, | |
| "learning_rate": 0.00010199556541019956, | |
| "loss": 7.5894, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.6002250909805298, | |
| "learning_rate": 0.0001042128603104213, | |
| "loss": 7.6017, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.6516796350479126, | |
| "learning_rate": 0.00010643015521064301, | |
| "loss": 7.4548, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 2.2168257236480713, | |
| "learning_rate": 0.00010864745011086475, | |
| "loss": 7.5867, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.5447593927383423, | |
| "learning_rate": 0.00011086474501108647, | |
| "loss": 7.5317, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.6840906143188477, | |
| "learning_rate": 0.00011308203991130821, | |
| "loss": 7.5127, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.2965503931045532, | |
| "learning_rate": 0.00011529933481152995, | |
| "loss": 7.4911, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.643584966659546, | |
| "learning_rate": 0.00011751662971175166, | |
| "loss": 7.4416, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.5419111251831055, | |
| "learning_rate": 0.0001197339246119734, | |
| "loss": 7.4944, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.7774205207824707, | |
| "learning_rate": 0.00012195121951219512, | |
| "loss": 7.4244, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 2.1709322929382324, | |
| "learning_rate": 0.00012416851441241685, | |
| "loss": 7.371, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.5503411293029785, | |
| "learning_rate": 0.0001263858093126386, | |
| "loss": 7.3031, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.7744035720825195, | |
| "learning_rate": 0.00012860310421286032, | |
| "loss": 7.3338, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 2.2014000415802, | |
| "learning_rate": 0.00013082039911308205, | |
| "loss": 7.2962, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.6716220378875732, | |
| "learning_rate": 0.00013303769401330377, | |
| "loss": 7.3348, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.7045074701309204, | |
| "learning_rate": 0.0001352549889135255, | |
| "loss": 7.2864, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.8933771848678589, | |
| "learning_rate": 0.00013747228381374724, | |
| "loss": 7.2744, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 2.298779249191284, | |
| "learning_rate": 0.00013968957871396897, | |
| "loss": 7.2472, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 1.3420922756195068, | |
| "learning_rate": 0.0001419068736141907, | |
| "loss": 7.3019, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 1.9339039325714111, | |
| "learning_rate": 0.00014412416851441242, | |
| "loss": 7.2982, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 2.69667387008667, | |
| "learning_rate": 0.00014634146341463414, | |
| "loss": 7.2851, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 2.3124189376831055, | |
| "learning_rate": 0.0001485587583148559, | |
| "loss": 7.258, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 1.975651741027832, | |
| "learning_rate": 0.00015077605321507762, | |
| "loss": 7.1275, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 1.9704022407531738, | |
| "learning_rate": 0.00015299334811529934, | |
| "loss": 7.1473, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 2.5047757625579834, | |
| "learning_rate": 0.00015521064301552106, | |
| "loss": 7.1096, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 1.5465894937515259, | |
| "learning_rate": 0.0001574279379157428, | |
| "loss": 7.1501, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 1.9557933807373047, | |
| "learning_rate": 0.00015964523281596454, | |
| "loss": 7.2033, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 2.420116424560547, | |
| "learning_rate": 0.00016186252771618626, | |
| "loss": 7.1275, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 2.114737033843994, | |
| "learning_rate": 0.000164079822616408, | |
| "loss": 7.0932, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 2.3085389137268066, | |
| "learning_rate": 0.00016629711751662974, | |
| "loss": 7.0311, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 2.5679140090942383, | |
| "learning_rate": 0.00016851441241685144, | |
| "loss": 6.9168, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 1.8611838817596436, | |
| "learning_rate": 0.0001707317073170732, | |
| "loss": 7.0085, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 1.8603994846343994, | |
| "learning_rate": 0.0001729490022172949, | |
| "loss": 6.9432, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 2.4244627952575684, | |
| "learning_rate": 0.00017516629711751663, | |
| "loss": 6.9333, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 2.177870750427246, | |
| "learning_rate": 0.00017738359201773839, | |
| "loss": 6.9499, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 1.9320554733276367, | |
| "learning_rate": 0.00017960088691796008, | |
| "loss": 6.8204, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 1.5062849521636963, | |
| "learning_rate": 0.00018181818181818183, | |
| "loss": 6.9505, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 2.9272422790527344, | |
| "learning_rate": 0.00018403547671840356, | |
| "loss": 6.8701, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 2.0309596061706543, | |
| "learning_rate": 0.00018625277161862528, | |
| "loss": 6.924, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 2.0265886783599854, | |
| "learning_rate": 0.00018847006651884703, | |
| "loss": 6.9223, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 2.5160486698150635, | |
| "learning_rate": 0.00019068736141906876, | |
| "loss": 6.8708, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 2.613301992416382, | |
| "learning_rate": 0.00019290465631929045, | |
| "loss": 6.8937, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 2.3031229972839355, | |
| "learning_rate": 0.0001951219512195122, | |
| "loss": 6.8337, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 2.54779052734375, | |
| "learning_rate": 0.00019733924611973393, | |
| "loss": 6.8334, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 2.8277971744537354, | |
| "learning_rate": 0.00019955654101995565, | |
| "loss": 6.7925, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 2.0113885402679443, | |
| "learning_rate": 0.00019999989242739025, | |
| "loss": 6.8458, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 2.2395377159118652, | |
| "learning_rate": 0.00019999945541405976, | |
| "loss": 6.6251, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 2.445993423461914, | |
| "learning_rate": 0.0001999986822381884, | |
| "loss": 6.8099, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 4.077752590179443, | |
| "learning_rate": 0.0001999975729023753, | |
| "loss": 6.8053, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 3.167569875717163, | |
| "learning_rate": 0.00019999612741034963, | |
| "loss": 6.7706, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 1.893659234046936, | |
| "learning_rate": 0.00019999434576697066, | |
| "loss": 6.8245, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 3.6101326942443848, | |
| "learning_rate": 0.00019999222797822762, | |
| "loss": 6.7407, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 2.2858726978302, | |
| "learning_rate": 0.00019998977405123974, | |
| "loss": 6.74, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 1.9325459003448486, | |
| "learning_rate": 0.0001999869839942563, | |
| "loss": 6.716, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 2.0043437480926514, | |
| "learning_rate": 0.00019998385781665643, | |
| "loss": 6.6003, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 4.151523113250732, | |
| "learning_rate": 0.00019998039552894924, | |
| "loss": 6.6801, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 3.8407771587371826, | |
| "learning_rate": 0.00019997659714277372, | |
| "loss": 6.608, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 2.230713129043579, | |
| "learning_rate": 0.00019997246267089867, | |
| "loss": 6.6479, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 2.2546942234039307, | |
| "learning_rate": 0.0001999679921272227, | |
| "loss": 6.6548, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 3.180986166000366, | |
| "learning_rate": 0.00019996318552677425, | |
| "loss": 6.6851, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 2.341231346130371, | |
| "learning_rate": 0.00019995804288571134, | |
| "loss": 6.547, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 3.1117124557495117, | |
| "learning_rate": 0.00019995256422132172, | |
| "loss": 6.7072, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 2.0082530975341797, | |
| "learning_rate": 0.0001999467495520227, | |
| "loss": 6.5422, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 2.409489870071411, | |
| "learning_rate": 0.0001999405988973611, | |
| "loss": 6.3716, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 2.649052381515503, | |
| "learning_rate": 0.00019993411227801328, | |
| "loss": 6.6434, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 3.081116199493408, | |
| "learning_rate": 0.00019992728971578492, | |
| "loss": 6.4624, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 3.1578280925750732, | |
| "learning_rate": 0.00019992013123361102, | |
| "loss": 6.5416, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 3.7874557971954346, | |
| "learning_rate": 0.0001999126368555559, | |
| "loss": 6.4512, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 2.7693099975585938, | |
| "learning_rate": 0.00019990480660681293, | |
| "loss": 6.5105, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 2.4338185787200928, | |
| "learning_rate": 0.00019989680712666593, | |
| "loss": 6.5092, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 3.656937837600708, | |
| "learning_rate": 0.00019988831193270577, | |
| "loss": 6.4269, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 2.857292652130127, | |
| "learning_rate": 0.00019987948094982952, | |
| "loss": 6.4387, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 3.4963467121124268, | |
| "learning_rate": 0.00019987031420772385, | |
| "loss": 6.3851, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 2.602522611618042, | |
| "learning_rate": 0.00019986081173720396, | |
| "loss": 6.3413, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 2.6455273628234863, | |
| "learning_rate": 0.00019985097357021385, | |
| "loss": 6.2965, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 3.5592167377471924, | |
| "learning_rate": 0.0001998407997398259, | |
| "loss": 6.4293, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 3.6016533374786377, | |
| "learning_rate": 0.00019983029028024094, | |
| "loss": 6.2897, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 2.5536839962005615, | |
| "learning_rate": 0.000199819445226788, | |
| "loss": 6.3157, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 2.0514349937438965, | |
| "learning_rate": 0.00019980826461592427, | |
| "loss": 6.3847, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 2.72495174407959, | |
| "learning_rate": 0.00019979674848523505, | |
| "loss": 6.3517, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 2.4264872074127197, | |
| "learning_rate": 0.00019978489687343335, | |
| "loss": 6.2533, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 2.8361423015594482, | |
| "learning_rate": 0.0001997727098203602, | |
| "loss": 6.3654, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 2.9690892696380615, | |
| "learning_rate": 0.00019976018736698404, | |
| "loss": 6.3968, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 2.6132867336273193, | |
| "learning_rate": 0.0001997473295554009, | |
| "loss": 6.3444, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 4.820697784423828, | |
| "learning_rate": 0.00019973413642883424, | |
| "loss": 6.2019, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 2.2316782474517822, | |
| "learning_rate": 0.00019972060803163458, | |
| "loss": 6.2049, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 3.9528305530548096, | |
| "learning_rate": 0.00019970674440927957, | |
| "loss": 6.1718, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 1.891073226928711, | |
| "learning_rate": 0.0001996925456083738, | |
| "loss": 6.2393, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 2.813270092010498, | |
| "learning_rate": 0.00019967801167664853, | |
| "loss": 6.2116, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 2.2726826667785645, | |
| "learning_rate": 0.00019966314266296173, | |
| "loss": 6.1521, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 2.3895318508148193, | |
| "learning_rate": 0.00019964793861729772, | |
| "loss": 6.1072, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 3.190431833267212, | |
| "learning_rate": 0.000199632399590767, | |
| "loss": 6.2009, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 3.79266095161438, | |
| "learning_rate": 0.00019961652563560634, | |
| "loss": 6.028, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 3.260039806365967, | |
| "learning_rate": 0.00019960031680517826, | |
| "loss": 6.0733, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 3.0739686489105225, | |
| "learning_rate": 0.0001995837731539711, | |
| "loss": 6.0521, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 3.0517771244049072, | |
| "learning_rate": 0.00019956689473759872, | |
| "loss": 6.0544, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 3.9524648189544678, | |
| "learning_rate": 0.0001995496816128003, | |
| "loss": 6.1326, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 4.498497486114502, | |
| "learning_rate": 0.00019953213383744033, | |
| "loss": 6.236, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 4.157576084136963, | |
| "learning_rate": 0.00019951425147050807, | |
| "loss": 5.9898, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 3.9297516345977783, | |
| "learning_rate": 0.00019949603457211775, | |
| "loss": 6.086, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 3.3214786052703857, | |
| "learning_rate": 0.00019947748320350804, | |
| "loss": 5.9589, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 2.8847291469573975, | |
| "learning_rate": 0.00019945859742704201, | |
| "loss": 6.1931, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 3.387896776199341, | |
| "learning_rate": 0.00019943937730620702, | |
| "loss": 6.0539, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 3.1214797496795654, | |
| "learning_rate": 0.00019941982290561417, | |
| "loss": 6.0288, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 3.7995123863220215, | |
| "learning_rate": 0.00019939993429099841, | |
| "loss": 6.0526, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 4.788393974304199, | |
| "learning_rate": 0.00019937971152921818, | |
| "loss": 5.9799, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 4.009220123291016, | |
| "learning_rate": 0.0001993591546882552, | |
| "loss": 6.1223, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 3.5576276779174805, | |
| "learning_rate": 0.00019933826383721428, | |
| "loss": 5.989, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 3.1287412643432617, | |
| "learning_rate": 0.00019931703904632294, | |
| "loss": 6.0542, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 3.6518595218658447, | |
| "learning_rate": 0.00019929548038693146, | |
| "loss": 6.041, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 3.268080472946167, | |
| "learning_rate": 0.0001992735879315123, | |
| "loss": 5.888, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 3.6055593490600586, | |
| "learning_rate": 0.00019925136175366007, | |
| "loss": 5.913, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 4.866463661193848, | |
| "learning_rate": 0.00019922880192809137, | |
| "loss": 5.9858, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 3.44808292388916, | |
| "learning_rate": 0.00019920590853064423, | |
| "loss": 5.7686, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 2.9507765769958496, | |
| "learning_rate": 0.00019918268163827808, | |
| "loss": 5.8557, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 3.441870927810669, | |
| "learning_rate": 0.00019915912132907352, | |
| "loss": 5.8268, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 3.838809013366699, | |
| "learning_rate": 0.00019913522768223182, | |
| "loss": 5.9833, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 4.165487289428711, | |
| "learning_rate": 0.00019911100077807498, | |
| "loss": 5.7422, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 3.5947463512420654, | |
| "learning_rate": 0.0001990864406980452, | |
| "loss": 5.7479, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 4.130446434020996, | |
| "learning_rate": 0.00019906154752470472, | |
| "loss": 5.7767, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 4.866550922393799, | |
| "learning_rate": 0.00019903632134173554, | |
| "loss": 5.7681, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 3.2839725017547607, | |
| "learning_rate": 0.00019901076223393903, | |
| "loss": 5.6656, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 3.0762476921081543, | |
| "learning_rate": 0.0001989848702872359, | |
| "loss": 5.789, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 3.7109107971191406, | |
| "learning_rate": 0.00019895864558866556, | |
| "loss": 5.773, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 5.400998115539551, | |
| "learning_rate": 0.00019893208822638618, | |
| "loss": 5.7506, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 3.3062849044799805, | |
| "learning_rate": 0.00019890519828967413, | |
| "loss": 5.7515, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 4.109920501708984, | |
| "learning_rate": 0.00019887797586892373, | |
| "loss": 5.7972, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 3.4838390350341797, | |
| "learning_rate": 0.00019885042105564717, | |
| "loss": 5.6753, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 4.251760959625244, | |
| "learning_rate": 0.00019882253394247381, | |
| "loss": 5.6303, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 4.042376518249512, | |
| "learning_rate": 0.00019879431462315025, | |
| "loss": 5.5753, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 4.239652633666992, | |
| "learning_rate": 0.0001987657631925398, | |
| "loss": 5.5335, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 5.15481424331665, | |
| "learning_rate": 0.00019873687974662215, | |
| "loss": 5.5396, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 4.36835241317749, | |
| "learning_rate": 0.00019870766438249317, | |
| "loss": 5.6017, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 4.165258407592773, | |
| "learning_rate": 0.00019867811719836452, | |
| "loss": 5.7228, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 4.125988006591797, | |
| "learning_rate": 0.0001986482382935633, | |
| "loss": 5.5787, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 4.177731037139893, | |
| "learning_rate": 0.0001986180277685317, | |
| "loss": 5.5829, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 5.006561279296875, | |
| "learning_rate": 0.00019858748572482683, | |
| "loss": 5.5466, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 4.33070182800293, | |
| "learning_rate": 0.00019855661226512007, | |
| "loss": 5.5544, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 4.358560085296631, | |
| "learning_rate": 0.00019852540749319708, | |
| "loss": 5.4599, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 4.536096096038818, | |
| "learning_rate": 0.00019849387151395708, | |
| "loss": 5.4983, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 4.66163444519043, | |
| "learning_rate": 0.0001984620044334129, | |
| "loss": 5.4097, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 4.4319233894348145, | |
| "learning_rate": 0.00019842980635869024, | |
| "loss": 5.4093, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 4.98419713973999, | |
| "learning_rate": 0.0001983972773980276, | |
| "loss": 5.4056, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 3.6354339122772217, | |
| "learning_rate": 0.0001983644176607757, | |
| "loss": 5.3171, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 4.495342254638672, | |
| "learning_rate": 0.00019833122725739736, | |
| "loss": 5.4521, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 4.5558671951293945, | |
| "learning_rate": 0.00019829770629946678, | |
| "loss": 5.5158, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 3.7165732383728027, | |
| "learning_rate": 0.00019826385489966957, | |
| "loss": 5.301, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 6.030915260314941, | |
| "learning_rate": 0.00019822967317180204, | |
| "loss": 5.3316, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 5.385923385620117, | |
| "learning_rate": 0.00019819516123077094, | |
| "loss": 5.3844, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 4.383516788482666, | |
| "learning_rate": 0.00019816101926755305, | |
| "loss": 5.2995, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 4.446406364440918, | |
| "learning_rate": 0.00019812585384780055, | |
| "loss": 5.386, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 4.345483303070068, | |
| "learning_rate": 0.00019809035856388805, | |
| "loss": 5.2815, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 4.791261672973633, | |
| "learning_rate": 0.00019805453353513813, | |
| "loss": 5.3757, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 5.622151851654053, | |
| "learning_rate": 0.00019801837888198172, | |
| "loss": 5.4405, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 4.934606075286865, | |
| "learning_rate": 0.0001979818947259579, | |
| "loss": 5.139, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 3.9659693241119385, | |
| "learning_rate": 0.0001979450811897134, | |
| "loss": 5.1726, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 5.214992046356201, | |
| "learning_rate": 0.00019790793839700226, | |
| "loss": 5.2864, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 4.5359601974487305, | |
| "learning_rate": 0.00019787046647268524, | |
| "loss": 5.1443, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 4.26462984085083, | |
| "learning_rate": 0.00019783266554272962, | |
| "loss": 5.0597, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 5.053945064544678, | |
| "learning_rate": 0.00019779453573420873, | |
| "loss": 5.2946, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 6.082211494445801, | |
| "learning_rate": 0.00019775607717530127, | |
| "loss": 5.2075, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 4.107390403747559, | |
| "learning_rate": 0.00019771728999529132, | |
| "loss": 5.1394, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 4.58411169052124, | |
| "learning_rate": 0.00019767817432456752, | |
| "loss": 5.1064, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 8.38965892791748, | |
| "learning_rate": 0.00019763952239228627, | |
| "loss": 5.0808, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 3.885803699493408, | |
| "learning_rate": 0.00019759975669894338, | |
| "loss": 5.0664, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 4.1605916023254395, | |
| "learning_rate": 0.00019755966290999167, | |
| "loss": 5.2469, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 4.821887016296387, | |
| "learning_rate": 0.00019751924116021225, | |
| "loss": 5.2451, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 3.865694761276245, | |
| "learning_rate": 0.00019747849158548858, | |
| "loss": 5.2334, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 3.640681028366089, | |
| "learning_rate": 0.00019743741432280625, | |
| "loss": 5.1206, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 4.04166316986084, | |
| "learning_rate": 0.00019739600951025236, | |
| "loss": 5.0059, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 4.637605667114258, | |
| "learning_rate": 0.00019735427728701516, | |
| "loss": 5.0302, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 4.08723783493042, | |
| "learning_rate": 0.0001973122177933835, | |
| "loss": 5.1551, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 3.7944953441619873, | |
| "learning_rate": 0.00019726983117074643, | |
| "loss": 5.0665, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 5.2847371101379395, | |
| "learning_rate": 0.00019722711756159266, | |
| "loss": 5.2212, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 4.109150409698486, | |
| "learning_rate": 0.00019718407710951012, | |
| "loss": 5.2645, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 4.127768039703369, | |
| "learning_rate": 0.0001971407099591855, | |
| "loss": 5.0395, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 5.058667182922363, | |
| "learning_rate": 0.00019709701625640367, | |
| "loss": 5.0247, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 5.4407267570495605, | |
| "learning_rate": 0.00019705299614804732, | |
| "loss": 4.9935, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 3.7877707481384277, | |
| "learning_rate": 0.00019700864978209636, | |
| "loss": 5.074, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 3.777330160140991, | |
| "learning_rate": 0.00019696397730762746, | |
| "loss": 5.0458, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 4.143067836761475, | |
| "learning_rate": 0.0001969189788748136, | |
| "loss": 4.9375, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 5.560107231140137, | |
| "learning_rate": 0.00019687365463492344, | |
| "loss": 4.8285, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 4.057905197143555, | |
| "learning_rate": 0.00019682800474032095, | |
| "loss": 4.9753, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 3.835442066192627, | |
| "learning_rate": 0.00019678202934446482, | |
| "loss": 4.9368, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 5.135551929473877, | |
| "learning_rate": 0.0001967357286019079, | |
| "loss": 4.9994, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 4.615053653717041, | |
| "learning_rate": 0.00019668910266829685, | |
| "loss": 5.0182, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 4.474258899688721, | |
| "learning_rate": 0.0001966421517003714, | |
| "loss": 4.8704, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 4.264945030212402, | |
| "learning_rate": 0.00019659487585596406, | |
| "loss": 4.9076, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 4.091209411621094, | |
| "learning_rate": 0.00019654727529399925, | |
| "loss": 4.7135, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 4.154038429260254, | |
| "learning_rate": 0.00019649935017449318, | |
| "loss": 4.8239, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 3.697162628173828, | |
| "learning_rate": 0.00019645110065855305, | |
| "loss": 4.9972, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 4.0024847984313965, | |
| "learning_rate": 0.00019640252690837645, | |
| "loss": 4.8854, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 3.9416885375976562, | |
| "learning_rate": 0.0001963536290872511, | |
| "loss": 4.8547, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 3.978651285171509, | |
| "learning_rate": 0.000196304407359554, | |
| "loss": 4.7873, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 4.435175895690918, | |
| "learning_rate": 0.0001962548618907511, | |
| "loss": 4.8124, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 3.8776824474334717, | |
| "learning_rate": 0.00019620499284739662, | |
| "loss": 4.8896, | |
| "step": 12050 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 5.041496276855469, | |
| "learning_rate": 0.00019615480039713248, | |
| "loss": 4.8343, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 4.18281888961792, | |
| "learning_rate": 0.00019610428470868784, | |
| "loss": 4.8559, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 4.223630905151367, | |
| "learning_rate": 0.00019605344595187844, | |
| "loss": 4.8153, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 4.63677453994751, | |
| "learning_rate": 0.0001960022842976061, | |
| "loss": 4.7951, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 4.188296794891357, | |
| "learning_rate": 0.00019595079991785802, | |
| "loss": 4.8904, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 4.402559280395508, | |
| "learning_rate": 0.00019589899298570634, | |
| "loss": 4.7851, | |
| "step": 12350 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 5.976877212524414, | |
| "learning_rate": 0.00019584686367530755, | |
| "loss": 4.6431, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 4.849298477172852, | |
| "learning_rate": 0.0001957944121619018, | |
| "loss": 4.7544, | |
| "step": 12450 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 4.932714462280273, | |
| "learning_rate": 0.0001957416386218124, | |
| "loss": 4.6811, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 4.682474136352539, | |
| "learning_rate": 0.00019568854323244515, | |
| "loss": 4.799, | |
| "step": 12550 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 5.228520393371582, | |
| "learning_rate": 0.00019563619766470511, | |
| "loss": 4.7622, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 4.093870162963867, | |
| "learning_rate": 0.00019558246554138458, | |
| "loss": 4.7369, | |
| "step": 12650 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 5.248356342315674, | |
| "learning_rate": 0.0001955284121038694, | |
| "loss": 4.7519, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 3.924299955368042, | |
| "learning_rate": 0.00019547403753386803, | |
| "loss": 4.6441, | |
| "step": 12750 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 4.972569942474365, | |
| "learning_rate": 0.00019542043906868188, | |
| "loss": 4.7192, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 5.033604145050049, | |
| "learning_rate": 0.00019536542919665846, | |
| "loss": 4.6397, | |
| "step": 12850 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 5.222695350646973, | |
| "learning_rate": 0.00019531009874003928, | |
| "loss": 4.6309, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 3.810999631881714, | |
| "learning_rate": 0.00019525444788482562, | |
| "loss": 4.6513, | |
| "step": 12950 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 5.272600173950195, | |
| "learning_rate": 0.00019519847681809585, | |
| "loss": 4.8001, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 4.836308002471924, | |
| "learning_rate": 0.00019514218572800468, | |
| "loss": 4.7101, | |
| "step": 13050 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 4.598148345947266, | |
| "learning_rate": 0.00019508557480378276, | |
| "loss": 4.5578, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 3.910820722579956, | |
| "learning_rate": 0.0001950286442357358, | |
| "loss": 4.7124, | |
| "step": 13150 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 3.856081962585449, | |
| "learning_rate": 0.00019497139421524416, | |
| "loss": 4.7563, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 4.151907920837402, | |
| "learning_rate": 0.00019491382493476195, | |
| "loss": 4.6726, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 4.349935054779053, | |
| "learning_rate": 0.0001948559365878166, | |
| "loss": 4.6341, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 3.8229756355285645, | |
| "learning_rate": 0.00019479772936900811, | |
| "loss": 4.6183, | |
| "step": 13350 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 5.495506286621094, | |
| "learning_rate": 0.0001947392034740084, | |
| "loss": 4.6608, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 4.307513236999512, | |
| "learning_rate": 0.00019468035909956072, | |
| "loss": 4.6805, | |
| "step": 13450 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 3.939659595489502, | |
| "learning_rate": 0.0001946211964434788, | |
| "loss": 4.679, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 5.444967269897461, | |
| "learning_rate": 0.00019456171570464653, | |
| "loss": 4.7195, | |
| "step": 13550 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 4.513270854949951, | |
| "learning_rate": 0.00019450191708301687, | |
| "loss": 4.5367, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 4.617405414581299, | |
| "learning_rate": 0.00019444180077961146, | |
| "loss": 4.5742, | |
| "step": 13650 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 4.580646991729736, | |
| "learning_rate": 0.00019438136699652001, | |
| "loss": 4.4936, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 4.657532691955566, | |
| "learning_rate": 0.00019432061593689927, | |
| "loss": 4.6877, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 5.374803066253662, | |
| "learning_rate": 0.0001942595478049727, | |
| "loss": 4.6101, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 5.1111650466918945, | |
| "learning_rate": 0.00019419816280602962, | |
| "loss": 4.6185, | |
| "step": 13850 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 5.18306303024292, | |
| "learning_rate": 0.00019413646114642446, | |
| "loss": 4.5524, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 4.411191463470459, | |
| "learning_rate": 0.00019407444303357624, | |
| "loss": 4.4346, | |
| "step": 13950 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 4.161925792694092, | |
| "learning_rate": 0.0001940121086759678, | |
| "loss": 4.3702, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 5.059813022613525, | |
| "learning_rate": 0.000193949458283145, | |
| "loss": 4.5351, | |
| "step": 14050 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 5.563150882720947, | |
| "learning_rate": 0.00019388649206571616, | |
| "loss": 4.477, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 5.1144609451293945, | |
| "learning_rate": 0.00019382321023535127, | |
| "loss": 4.6033, | |
| "step": 14150 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 4.734794616699219, | |
| "learning_rate": 0.00019375961300478127, | |
| "loss": 4.5287, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 4.543684959411621, | |
| "learning_rate": 0.00019369570058779743, | |
| "loss": 4.4474, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 5.4647979736328125, | |
| "learning_rate": 0.00019363147319925047, | |
| "loss": 4.3806, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 5.058681964874268, | |
| "learning_rate": 0.00019356693105505006, | |
| "loss": 4.4998, | |
| "step": 14350 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 5.494804859161377, | |
| "learning_rate": 0.00019350207437216386, | |
| "loss": 4.3911, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 5.227470397949219, | |
| "learning_rate": 0.00019343690336861687, | |
| "loss": 4.2557, | |
| "step": 14450 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 3.7686829566955566, | |
| "learning_rate": 0.00019337141826349092, | |
| "loss": 4.313, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 4.975152492523193, | |
| "learning_rate": 0.00019330561927692345, | |
| "loss": 4.2914, | |
| "step": 14550 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 5.811885356903076, | |
| "learning_rate": 0.00019323950663010733, | |
| "loss": 4.3566, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 5.566829204559326, | |
| "learning_rate": 0.00019317308054528966, | |
| "loss": 4.2847, | |
| "step": 14650 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 5.977478504180908, | |
| "learning_rate": 0.0001931063412457713, | |
| "loss": 4.3034, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 4.601086616516113, | |
| "learning_rate": 0.00019303928895590596, | |
| "loss": 4.1929, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 5.051478385925293, | |
| "learning_rate": 0.0001929719239010996, | |
| "loss": 4.2749, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 6.248847961425781, | |
| "learning_rate": 0.00019290424630780947, | |
| "loss": 4.3419, | |
| "step": 14850 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 5.392062664031982, | |
| "learning_rate": 0.0001928362564035436, | |
| "loss": 4.4038, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 5.6346211433410645, | |
| "learning_rate": 0.00019276795441685975, | |
| "loss": 4.3403, | |
| "step": 14950 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 5.646982192993164, | |
| "learning_rate": 0.00019269934057736493, | |
| "loss": 4.252, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 5.455059051513672, | |
| "learning_rate": 0.00019263041511571438, | |
| "loss": 4.3809, | |
| "step": 15050 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 5.478726387023926, | |
| "learning_rate": 0.00019256117826361096, | |
| "loss": 4.1885, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 5.029292106628418, | |
| "learning_rate": 0.0001924916302538043, | |
| "loss": 4.2615, | |
| "step": 15150 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 5.6447978019714355, | |
| "learning_rate": 0.00019242177132009, | |
| "loss": 4.268, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 5.165138244628906, | |
| "learning_rate": 0.00019235160169730895, | |
| "loss": 4.3222, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 5.661884784698486, | |
| "learning_rate": 0.00019228112162134641, | |
| "loss": 4.3179, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 6.117990493774414, | |
| "learning_rate": 0.0001922103313291313, | |
| "loss": 4.2241, | |
| "step": 15350 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 4.299765110015869, | |
| "learning_rate": 0.0001921392310586353, | |
| "loss": 4.2602, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 5.798460483551025, | |
| "learning_rate": 0.00019206782104887223, | |
| "loss": 4.3096, | |
| "step": 15450 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 5.016506671905518, | |
| "learning_rate": 0.00019199610153989712, | |
| "loss": 4.2073, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 9.708767890930176, | |
| "learning_rate": 0.0001919240727728054, | |
| "loss": 4.2099, | |
| "step": 15550 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 4.904361248016357, | |
| "learning_rate": 0.00019185173498973204, | |
| "loss": 4.2461, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 5.290199279785156, | |
| "learning_rate": 0.00019177908843385103, | |
| "loss": 4.115, | |
| "step": 15650 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 6.290179252624512, | |
| "learning_rate": 0.00019170613334937406, | |
| "loss": 4.3295, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 5.071104526519775, | |
| "learning_rate": 0.00019163286998155027, | |
| "loss": 4.1532, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 4.5464067459106445, | |
| "learning_rate": 0.00019155929857666494, | |
| "loss": 4.0761, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 4.664229393005371, | |
| "learning_rate": 0.0001914854193820389, | |
| "loss": 4.1371, | |
| "step": 15850 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 7.168484210968018, | |
| "learning_rate": 0.0001914112326460277, | |
| "loss": 4.178, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 6.570041179656982, | |
| "learning_rate": 0.0001913367386180207, | |
| "loss": 4.1536, | |
| "step": 15950 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 5.298222064971924, | |
| "learning_rate": 0.00019126193754844036, | |
| "loss": 4.2089, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 7.139255523681641, | |
| "learning_rate": 0.0001911868296887411, | |
| "loss": 4.1362, | |
| "step": 16050 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 5.763050556182861, | |
| "learning_rate": 0.00019111141529140887, | |
| "loss": 4.1106, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 6.586143493652344, | |
| "learning_rate": 0.00019103569460995998, | |
| "loss": 3.9519, | |
| "step": 16150 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 5.827348232269287, | |
| "learning_rate": 0.00019095966789894038, | |
| "loss": 3.9598, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 5.121611595153809, | |
| "learning_rate": 0.00019088333541392478, | |
| "loss": 4.1347, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 5.110377788543701, | |
| "learning_rate": 0.00019080669741151581, | |
| "loss": 4.0088, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 6.672893047332764, | |
| "learning_rate": 0.00019072975414934318, | |
| "loss": 4.0916, | |
| "step": 16350 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 5.667397499084473, | |
| "learning_rate": 0.00019065250588606262, | |
| "loss": 4.0695, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 6.404243469238281, | |
| "learning_rate": 0.0001905749528813553, | |
| "loss": 3.9728, | |
| "step": 16450 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 6.912601470947266, | |
| "learning_rate": 0.00019049709539592686, | |
| "loss": 4.029, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 5.015479564666748, | |
| "learning_rate": 0.00019041893369150636, | |
| "loss": 4.0268, | |
| "step": 16550 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 6.656422138214111, | |
| "learning_rate": 0.00019034046803084563, | |
| "loss": 4.0393, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 4.685242176055908, | |
| "learning_rate": 0.00019026169867771825, | |
| "loss": 4.1104, | |
| "step": 16650 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 6.503780364990234, | |
| "learning_rate": 0.00019018262589691874, | |
| "loss": 4.0344, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 4.73757266998291, | |
| "learning_rate": 0.00019010324995426156, | |
| "loss": 4.1114, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 7.276214122772217, | |
| "learning_rate": 0.0001900235711165804, | |
| "loss": 3.8838, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 6.2224273681640625, | |
| "learning_rate": 0.00018994358965172717, | |
| "loss": 3.9479, | |
| "step": 16850 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 6.4751996994018555, | |
| "learning_rate": 0.00018986330582857096, | |
| "loss": 4.0079, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 4.874088764190674, | |
| "learning_rate": 0.00018978271991699743, | |
| "loss": 4.1664, | |
| "step": 16950 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 7.713326454162598, | |
| "learning_rate": 0.0001897018321879077, | |
| "loss": 3.9646, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 5.753252029418945, | |
| "learning_rate": 0.00018962064291321747, | |
| "loss": 3.8574, | |
| "step": 17050 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 5.962434768676758, | |
| "learning_rate": 0.0001895391523658562, | |
| "loss": 3.9757, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 5.875513553619385, | |
| "learning_rate": 0.00018945736081976607, | |
| "loss": 4.0424, | |
| "step": 17150 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 6.298293590545654, | |
| "learning_rate": 0.00018937526854990108, | |
| "loss": 3.958, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 4.98872184753418, | |
| "learning_rate": 0.00018929287583222625, | |
| "loss": 3.9225, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 6.467836380004883, | |
| "learning_rate": 0.00018921018294371645, | |
| "loss": 3.9369, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 5.920988082885742, | |
| "learning_rate": 0.0001891271901623558, | |
| "loss": 3.975, | |
| "step": 17350 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 5.652931213378906, | |
| "learning_rate": 0.00018904389776713641, | |
| "loss": 3.9067, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 5.372093200683594, | |
| "learning_rate": 0.00018896030603805767, | |
| "loss": 3.9267, | |
| "step": 17450 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 5.743618965148926, | |
| "learning_rate": 0.00018887641525612518, | |
| "loss": 3.8912, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 8.207468032836914, | |
| "learning_rate": 0.00018879222570334985, | |
| "loss": 3.9101, | |
| "step": 17550 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 6.930370807647705, | |
| "learning_rate": 0.00018870773766274697, | |
| "loss": 3.8817, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 6.367077350616455, | |
| "learning_rate": 0.00018862295141833523, | |
| "loss": 3.8931, | |
| "step": 17650 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 6.587210178375244, | |
| "learning_rate": 0.00018853786725513575, | |
| "loss": 3.9393, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 5.502545356750488, | |
| "learning_rate": 0.0001884524854591712, | |
| "loss": 3.8489, | |
| "step": 17750 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 6.352043628692627, | |
| "learning_rate": 0.00018836680631746476, | |
| "loss": 3.8162, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 5.686196804046631, | |
| "learning_rate": 0.00018828083011803917, | |
| "loss": 3.9476, | |
| "step": 17850 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 6.225170612335205, | |
| "learning_rate": 0.00018819455714991578, | |
| "loss": 3.9404, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 7.1347150802612305, | |
| "learning_rate": 0.0001881079877031136, | |
| "loss": 3.9798, | |
| "step": 17950 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 5.343573093414307, | |
| "learning_rate": 0.0001880211220686482, | |
| "loss": 3.9038, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 6.858921051025391, | |
| "learning_rate": 0.00018793396053853098, | |
| "loss": 3.8792, | |
| "step": 18050 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 6.721033573150635, | |
| "learning_rate": 0.0001878482554434291, | |
| "loss": 3.8421, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 6.173632621765137, | |
| "learning_rate": 0.00018776050890530516, | |
| "loss": 4.0233, | |
| "step": 18150 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 5.996013164520264, | |
| "learning_rate": 0.00018767246734761796, | |
| "loss": 3.8057, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 5.707641124725342, | |
| "learning_rate": 0.00018758413106633186, | |
| "loss": 3.8299, | |
| "step": 18250 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 7.221241474151611, | |
| "learning_rate": 0.00018749550035840193, | |
| "loss": 3.8828, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 5.554357528686523, | |
| "learning_rate": 0.00018740657552177305, | |
| "loss": 3.8553, | |
| "step": 18350 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 4.664674282073975, | |
| "learning_rate": 0.00018731735685537885, | |
| "loss": 3.8838, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 5.485450267791748, | |
| "learning_rate": 0.00018722784465914071, | |
| "loss": 3.8165, | |
| "step": 18450 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.825826644897461, | |
| "learning_rate": 0.00018713803923396668, | |
| "loss": 3.7588, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.392491817474365, | |
| "learning_rate": 0.0001870479408817507, | |
| "loss": 3.8001, | |
| "step": 18550 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.493740081787109, | |
| "learning_rate": 0.00018695754990537123, | |
| "loss": 3.9735, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.905117511749268, | |
| "learning_rate": 0.00018686686660869062, | |
| "loss": 3.7334, | |
| "step": 18650 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.598316192626953, | |
| "learning_rate": 0.0001867758912965537, | |
| "loss": 3.8269, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.979629039764404, | |
| "learning_rate": 0.00018668462427478714, | |
| "loss": 3.8713, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 6.480854511260986, | |
| "learning_rate": 0.00018659306585019813, | |
| "loss": 3.7792, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.820549488067627, | |
| "learning_rate": 0.00018650121633057346, | |
| "loss": 3.6656, | |
| "step": 18850 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 6.579679012298584, | |
| "learning_rate": 0.0001864090760246785, | |
| "loss": 3.9109, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.669819355010986, | |
| "learning_rate": 0.00018631664524225615, | |
| "loss": 3.7815, | |
| "step": 18950 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 6.644351005554199, | |
| "learning_rate": 0.0001862239242940257, | |
| "loss": 3.7529, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.022332191467285, | |
| "learning_rate": 0.00018613091349168205, | |
| "loss": 3.7001, | |
| "step": 19050 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 6.376641273498535, | |
| "learning_rate": 0.00018603761314789425, | |
| "loss": 3.6871, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 6.298123359680176, | |
| "learning_rate": 0.00018594402357630495, | |
| "loss": 3.8095, | |
| "step": 19150 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 4.590997695922852, | |
| "learning_rate": 0.00018585014509152882, | |
| "loss": 3.8069, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 6.710943222045898, | |
| "learning_rate": 0.00018575597800915198, | |
| "loss": 3.8547, | |
| "step": 19250 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.5094499588012695, | |
| "learning_rate": 0.0001856615226457305, | |
| "loss": 3.7314, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.584799766540527, | |
| "learning_rate": 0.0001855667793187898, | |
| "loss": 3.7514, | |
| "step": 19350 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 6.0391154289245605, | |
| "learning_rate": 0.00018547174834682308, | |
| "loss": 3.6231, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 5.92927885055542, | |
| "learning_rate": 0.00018537643004929067, | |
| "loss": 3.7008, | |
| "step": 19450 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 6.359600067138672, | |
| "learning_rate": 0.00018528082474661867, | |
| "loss": 3.798, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 5.198579788208008, | |
| "learning_rate": 0.0001851849327601981, | |
| "loss": 3.7187, | |
| "step": 19550 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 6.796758651733398, | |
| "learning_rate": 0.00018508875441238364, | |
| "loss": 3.7086, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 4.889728546142578, | |
| "learning_rate": 0.00018499229002649258, | |
| "loss": 3.7387, | |
| "step": 19650 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 6.382203102111816, | |
| "learning_rate": 0.0001848955399268039, | |
| "loss": 3.5992, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 6.061376571655273, | |
| "learning_rate": 0.00018479850443855686, | |
| "loss": 3.6865, | |
| "step": 19750 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 5.2180681228637695, | |
| "learning_rate": 0.0001847011838879503, | |
| "loss": 3.7467, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 5.063679218292236, | |
| "learning_rate": 0.0001846035786021412, | |
| "loss": 3.6894, | |
| "step": 19850 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 5.036098480224609, | |
| "learning_rate": 0.00018450568890924373, | |
| "loss": 3.6412, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 4.86781644821167, | |
| "learning_rate": 0.00018440751513832822, | |
| "loss": 3.637, | |
| "step": 19950 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 5.41668176651001, | |
| "learning_rate": 0.00018430905761941983, | |
| "loss": 3.6814, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 6.117024898529053, | |
| "learning_rate": 0.00018421031668349773, | |
| "loss": 3.6257, | |
| "step": 20050 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 7.368699073791504, | |
| "learning_rate": 0.00018411129266249373, | |
| "loss": 3.7111, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 4.378394603729248, | |
| "learning_rate": 0.0001840119858892913, | |
| "loss": 3.7197, | |
| "step": 20150 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 7.029990196228027, | |
| "learning_rate": 0.0001839123966977245, | |
| "loss": 3.7267, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 9.922813415527344, | |
| "learning_rate": 0.00018381252542257662, | |
| "loss": 3.7203, | |
| "step": 20250 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 4.9374518394470215, | |
| "learning_rate": 0.00018371237239957932, | |
| "loss": 3.6876, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 5.682550430297852, | |
| "learning_rate": 0.00018361193796541142, | |
| "loss": 3.6862, | |
| "step": 20350 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 5.477772235870361, | |
| "learning_rate": 0.00018351122245769771, | |
| "loss": 3.5982, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 7.745680332183838, | |
| "learning_rate": 0.00018341224888886997, | |
| "loss": 3.6978, | |
| "step": 20450 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 6.691402912139893, | |
| "learning_rate": 0.0001833109778552932, | |
| "loss": 3.6693, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 6.229629993438721, | |
| "learning_rate": 0.00018320942675989125, | |
| "loss": 3.6327, | |
| "step": 20550 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 5.655289649963379, | |
| "learning_rate": 0.0001831075959440427, | |
| "loss": 3.6032, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 5.4868927001953125, | |
| "learning_rate": 0.00018300548575006658, | |
| "loss": 3.7059, | |
| "step": 20650 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 7.387706756591797, | |
| "learning_rate": 0.00018290309652122083, | |
| "loss": 3.6838, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 4.884798049926758, | |
| "learning_rate": 0.00018280042860170168, | |
| "loss": 3.665, | |
| "step": 20750 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 7.185595512390137, | |
| "learning_rate": 0.00018269748233664204, | |
| "loss": 3.6057, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 6.449123382568359, | |
| "learning_rate": 0.0001825942580721106, | |
| "loss": 3.6262, | |
| "step": 20850 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 5.469310283660889, | |
| "learning_rate": 0.00018249075615511053, | |
| "loss": 3.522, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 5.678877353668213, | |
| "learning_rate": 0.0001823869769335784, | |
| "loss": 3.6757, | |
| "step": 20950 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 5.033955097198486, | |
| "learning_rate": 0.000182282920756383, | |
| "loss": 3.7316, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 6.790628433227539, | |
| "learning_rate": 0.00018217858797332413, | |
| "loss": 3.545, | |
| "step": 21050 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 6.205599308013916, | |
| "learning_rate": 0.00018207397893513143, | |
| "loss": 3.6035, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 5.7604804039001465, | |
| "learning_rate": 0.00018196909399346316, | |
| "loss": 3.6869, | |
| "step": 21150 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 5.528883457183838, | |
| "learning_rate": 0.0001818639335009052, | |
| "loss": 3.6493, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 6.46929407119751, | |
| "learning_rate": 0.00018175849781096966, | |
| "loss": 3.639, | |
| "step": 21250 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 5.487035274505615, | |
| "learning_rate": 0.00018165278727809368, | |
| "loss": 3.5755, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 6.251669406890869, | |
| "learning_rate": 0.00018154680225763848, | |
| "loss": 3.704, | |
| "step": 21350 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 6.204404830932617, | |
| "learning_rate": 0.00018144054310588792, | |
| "loss": 3.6071, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 5.7311482429504395, | |
| "learning_rate": 0.00018133401018004743, | |
| "loss": 3.5395, | |
| "step": 21450 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 7.110382556915283, | |
| "learning_rate": 0.00018122720383824273, | |
| "loss": 3.6643, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 5.991401672363281, | |
| "learning_rate": 0.0001811201244395187, | |
| "loss": 3.6752, | |
| "step": 21550 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 5.788415431976318, | |
| "learning_rate": 0.0001810127723438381, | |
| "loss": 3.6362, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 4.827778339385986, | |
| "learning_rate": 0.00018090514791208043, | |
| "loss": 3.7298, | |
| "step": 21650 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 5.7845916748046875, | |
| "learning_rate": 0.0001807972515060407, | |
| "loss": 3.543, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 5.071081638336182, | |
| "learning_rate": 0.00018068908348842818, | |
| "loss": 3.5706, | |
| "step": 21750 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 5.189342021942139, | |
| "learning_rate": 0.00018058064422286525, | |
| "loss": 3.667, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 7.787344455718994, | |
| "learning_rate": 0.00018047193407388603, | |
| "loss": 3.4985, | |
| "step": 21850 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 4.411252975463867, | |
| "learning_rate": 0.00018036295340693531, | |
| "loss": 3.6719, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 5.700460433959961, | |
| "learning_rate": 0.00018025370258836732, | |
| "loss": 3.5075, | |
| "step": 21950 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 5.121459007263184, | |
| "learning_rate": 0.00018014418198544432, | |
| "loss": 3.5511, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 5.29133415222168, | |
| "learning_rate": 0.0001800343919663356, | |
| "loss": 3.7063, | |
| "step": 22050 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 6.53157377243042, | |
| "learning_rate": 0.00017992433290011604, | |
| "loss": 3.5146, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 6.442373275756836, | |
| "learning_rate": 0.00017981400515676508, | |
| "loss": 3.5431, | |
| "step": 22150 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 5.247061729431152, | |
| "learning_rate": 0.00017970340910716522, | |
| "loss": 3.604, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 6.570899486541748, | |
| "learning_rate": 0.000179592545123101, | |
| "loss": 3.6034, | |
| "step": 22250 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 6.012238502502441, | |
| "learning_rate": 0.00017948141357725764, | |
| "loss": 3.4793, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 5.5325422286987305, | |
| "learning_rate": 0.0001793700148432198, | |
| "loss": 3.563, | |
| "step": 22350 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 4.889975070953369, | |
| "learning_rate": 0.00017925834929547035, | |
| "loss": 3.5512, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 5.225555896759033, | |
| "learning_rate": 0.00017914641730938907, | |
| "loss": 3.5521, | |
| "step": 22450 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 5.430109024047852, | |
| "learning_rate": 0.0001790342192612514, | |
| "loss": 3.4549, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 5.8808274269104, | |
| "learning_rate": 0.00017892175552822716, | |
| "loss": 3.5518, | |
| "step": 22550 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 6.657894134521484, | |
| "learning_rate": 0.00017880902648837946, | |
| "loss": 3.4643, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 4.968985557556152, | |
| "learning_rate": 0.00017869603252066308, | |
| "loss": 3.5022, | |
| "step": 22650 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 5.369678497314453, | |
| "learning_rate": 0.00017858277400492357, | |
| "loss": 3.6906, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 6.433826446533203, | |
| "learning_rate": 0.0001784692513218956, | |
| "loss": 3.4281, | |
| "step": 22750 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 5.364591121673584, | |
| "learning_rate": 0.00017835546485320202, | |
| "loss": 3.6194, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 5.889247894287109, | |
| "learning_rate": 0.00017824141498135244, | |
| "loss": 3.7013, | |
| "step": 22850 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 5.254469394683838, | |
| "learning_rate": 0.0001781271020897419, | |
| "loss": 3.4107, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 5.486823558807373, | |
| "learning_rate": 0.0001780125265626495, | |
| "loss": 3.5453, | |
| "step": 22950 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 5.4713311195373535, | |
| "learning_rate": 0.0001778976887852375, | |
| "loss": 3.5482, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 6.2519731521606445, | |
| "learning_rate": 0.00017778258914354946, | |
| "loss": 3.6251, | |
| "step": 23050 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 5.657818794250488, | |
| "learning_rate": 0.00017766722802450944, | |
| "loss": 3.5081, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 6.176442623138428, | |
| "learning_rate": 0.0001775516058159204, | |
| "loss": 3.45, | |
| "step": 23150 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 5.838647365570068, | |
| "learning_rate": 0.00017743572290646303, | |
| "loss": 3.419, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 6.912227630615234, | |
| "learning_rate": 0.00017731957968569436, | |
| "loss": 3.4892, | |
| "step": 23250 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 7.276485919952393, | |
| "learning_rate": 0.0001772031765440465, | |
| "loss": 3.5143, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 5.351586818695068, | |
| "learning_rate": 0.0001770865138728254, | |
| "loss": 3.5467, | |
| "step": 23350 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 5.220416069030762, | |
| "learning_rate": 0.00017696959206420937, | |
| "loss": 3.4736, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 5.282609462738037, | |
| "learning_rate": 0.00017685241151124781, | |
| "loss": 3.4181, | |
| "step": 23450 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 4.958062171936035, | |
| "learning_rate": 0.00017673497260786006, | |
| "loss": 3.4309, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 6.3785929679870605, | |
| "learning_rate": 0.00017661727574883388, | |
| "loss": 3.3805, | |
| "step": 23550 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 5.702798366546631, | |
| "learning_rate": 0.00017649932132982415, | |
| "loss": 3.5371, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 6.496365547180176, | |
| "learning_rate": 0.0001763811097473516, | |
| "loss": 3.4107, | |
| "step": 23650 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 5.093421936035156, | |
| "learning_rate": 0.00017626264139880148, | |
| "loss": 3.5514, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 5.6509480476379395, | |
| "learning_rate": 0.0001761439166824221, | |
| "loss": 3.5612, | |
| "step": 23750 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 5.662957191467285, | |
| "learning_rate": 0.00017602493599732372, | |
| "loss": 3.5515, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 7.548245429992676, | |
| "learning_rate": 0.000175905699743477, | |
| "loss": 3.5552, | |
| "step": 23850 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 5.2797112464904785, | |
| "learning_rate": 0.00017578620832171173, | |
| "loss": 3.4159, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 6.431013584136963, | |
| "learning_rate": 0.0001756664621337155, | |
| "loss": 3.4257, | |
| "step": 23950 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 8.88436222076416, | |
| "learning_rate": 0.00017554646158203236, | |
| "loss": 3.5517, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 5.538012981414795, | |
| "learning_rate": 0.00017542620707006136, | |
| "loss": 3.4451, | |
| "step": 24050 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 6.702478408813477, | |
| "learning_rate": 0.00017530569900205538, | |
| "loss": 3.5453, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 5.236027240753174, | |
| "learning_rate": 0.00017518493778311957, | |
| "loss": 3.4483, | |
| "step": 24150 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 5.824537754058838, | |
| "learning_rate": 0.00017506392381921014, | |
| "loss": 3.507, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 6.682642459869385, | |
| "learning_rate": 0.0001749426575171329, | |
| "loss": 3.4624, | |
| "step": 24250 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 6.02097225189209, | |
| "learning_rate": 0.00017482113928454196, | |
| "loss": 3.4782, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 5.951188087463379, | |
| "learning_rate": 0.00017469936952993834, | |
| "loss": 3.5305, | |
| "step": 24350 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 5.47694730758667, | |
| "learning_rate": 0.00017457734866266854, | |
| "loss": 3.4653, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 5.073057651519775, | |
| "learning_rate": 0.0001744575249785453, | |
| "loss": 3.4969, | |
| "step": 24450 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 6.532285690307617, | |
| "learning_rate": 0.00017433500811915326, | |
| "loss": 3.3932, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 6.016458988189697, | |
| "learning_rate": 0.00017421224137194837, | |
| "loss": 3.4828, | |
| "step": 24550 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 7.032898902893066, | |
| "learning_rate": 0.0001740892251496286, | |
| "loss": 3.4347, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 6.3446431159973145, | |
| "learning_rate": 0.00017396595986573065, | |
| "loss": 3.4101, | |
| "step": 24650 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 5.952356815338135, | |
| "learning_rate": 0.00017384244593462859, | |
| "loss": 3.4296, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 5.403810501098633, | |
| "learning_rate": 0.00017371868377153216, | |
| "loss": 3.4264, | |
| "step": 24750 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 6.429996490478516, | |
| "learning_rate": 0.00017359467379248568, | |
| "loss": 3.4341, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 5.618744850158691, | |
| "learning_rate": 0.00017347041641436653, | |
| "loss": 3.3357, | |
| "step": 24850 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 6.539459705352783, | |
| "learning_rate": 0.00017334840455978504, | |
| "loss": 3.5718, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 4.993662357330322, | |
| "learning_rate": 0.00017322365856462736, | |
| "loss": 3.4774, | |
| "step": 24950 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 5.6996002197265625, | |
| "learning_rate": 0.00017309866641761798, | |
| "loss": 3.311, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 4.32814884185791, | |
| "learning_rate": 0.00017297342853893604, | |
| "loss": 3.5558, | |
| "step": 25050 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 5.538712501525879, | |
| "learning_rate": 0.0001728479453495866, | |
| "loss": 3.3261, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 4.975490093231201, | |
| "learning_rate": 0.00017272221727139946, | |
| "loss": 3.5, | |
| "step": 25150 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 4.377697467803955, | |
| "learning_rate": 0.00017259624472702764, | |
| "loss": 3.4562, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 6.686251163482666, | |
| "learning_rate": 0.00017247002813994592, | |
| "loss": 3.3968, | |
| "step": 25250 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 7.116434097290039, | |
| "learning_rate": 0.00017234356793444954, | |
| "loss": 3.3161, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 5.755252838134766, | |
| "learning_rate": 0.0001722168645356526, | |
| "loss": 3.4195, | |
| "step": 25350 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 6.647252559661865, | |
| "learning_rate": 0.00017208991836948685, | |
| "loss": 3.1887, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 4.997719764709473, | |
| "learning_rate": 0.0001719627298627, | |
| "loss": 3.4098, | |
| "step": 25450 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 6.054971218109131, | |
| "learning_rate": 0.00017183529944285456, | |
| "loss": 3.4159, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 4.706241130828857, | |
| "learning_rate": 0.00017170762753832615, | |
| "loss": 3.4024, | |
| "step": 25550 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 7.701054096221924, | |
| "learning_rate": 0.00017157971457830226, | |
| "loss": 3.3564, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 5.441225528717041, | |
| "learning_rate": 0.00017145156099278067, | |
| "loss": 3.5887, | |
| "step": 25650 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 5.631026268005371, | |
| "learning_rate": 0.0001713231672125681, | |
| "loss": 3.352, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 4.974308967590332, | |
| "learning_rate": 0.0001711945336692786, | |
| "loss": 3.3959, | |
| "step": 25750 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 7.063317775726318, | |
| "learning_rate": 0.00017106566079533246, | |
| "loss": 3.3942, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 6.312389850616455, | |
| "learning_rate": 0.0001709365490239543, | |
| "loss": 3.3928, | |
| "step": 25850 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 5.022332668304443, | |
| "learning_rate": 0.00017080719878917182, | |
| "loss": 3.4401, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 4.356366157531738, | |
| "learning_rate": 0.00017067761052581455, | |
| "loss": 3.4353, | |
| "step": 25950 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 5.611413478851318, | |
| "learning_rate": 0.00017054778466951196, | |
| "loss": 3.2737, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 7.244396686553955, | |
| "learning_rate": 0.0001704177216566924, | |
| "loss": 3.2309, | |
| "step": 26050 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 6.024662017822266, | |
| "learning_rate": 0.00017028742192458132, | |
| "loss": 3.3593, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 7.399158954620361, | |
| "learning_rate": 0.00017015688591120006, | |
| "loss": 3.2026, | |
| "step": 26150 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 7.96980619430542, | |
| "learning_rate": 0.00017002611405536413, | |
| "loss": 3.4413, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 5.538659572601318, | |
| "learning_rate": 0.00016989510679668194, | |
| "loss": 3.3497, | |
| "step": 26250 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 5.8960394859313965, | |
| "learning_rate": 0.00016976386457555323, | |
| "loss": 3.3708, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 5.486491680145264, | |
| "learning_rate": 0.00016963238783316754, | |
| "loss": 3.4697, | |
| "step": 26350 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 5.217641353607178, | |
| "learning_rate": 0.0001695006770115029, | |
| "loss": 3.4249, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 5.6906938552856445, | |
| "learning_rate": 0.00016936873255332413, | |
| "loss": 3.5343, | |
| "step": 26450 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 6.275619983673096, | |
| "learning_rate": 0.00016923655490218149, | |
| "loss": 3.3991, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 5.75913667678833, | |
| "learning_rate": 0.00016910414450240917, | |
| "loss": 3.3861, | |
| "step": 26550 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 5.206583499908447, | |
| "learning_rate": 0.0001689715017991237, | |
| "loss": 3.369, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 5.569302082061768, | |
| "learning_rate": 0.0001688386272382227, | |
| "loss": 3.4837, | |
| "step": 26650 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 5.360637187957764, | |
| "learning_rate": 0.00016870552126638298, | |
| "loss": 3.3299, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 5.168808937072754, | |
| "learning_rate": 0.00016857218433105945, | |
| "loss": 3.3613, | |
| "step": 26750 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 5.515918731689453, | |
| "learning_rate": 0.0001684386168804834, | |
| "loss": 3.159, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 5.885009765625, | |
| "learning_rate": 0.000168304819363661, | |
| "loss": 3.3029, | |
| "step": 26850 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 5.174962997436523, | |
| "learning_rate": 0.0001681707922303718, | |
| "loss": 3.3289, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 4.86044454574585, | |
| "learning_rate": 0.0001680365359311673, | |
| "loss": 3.3132, | |
| "step": 26950 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 6.267008304595947, | |
| "learning_rate": 0.00016790205091736935, | |
| "loss": 3.3649, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 6.221423625946045, | |
| "learning_rate": 0.00016776733764106862, | |
| "loss": 3.3311, | |
| "step": 27050 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 5.069894790649414, | |
| "learning_rate": 0.00016763239655512318, | |
| "loss": 3.3157, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 8.425812721252441, | |
| "learning_rate": 0.00016749722811315688, | |
| "loss": 3.2714, | |
| "step": 27150 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 5.80504846572876, | |
| "learning_rate": 0.00016736183276955783, | |
| "loss": 3.3274, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 7.617208003997803, | |
| "learning_rate": 0.00016722621097947697, | |
| "loss": 3.1857, | |
| "step": 27250 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 5.363246440887451, | |
| "learning_rate": 0.00016709036319882646, | |
| "loss": 3.4673, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 6.790156364440918, | |
| "learning_rate": 0.00016695428988427807, | |
| "loss": 3.3016, | |
| "step": 27350 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 4.9824981689453125, | |
| "learning_rate": 0.00016681799149326185, | |
| "loss": 3.4103, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 4.915642738342285, | |
| "learning_rate": 0.00016668146848396442, | |
| "loss": 3.4356, | |
| "step": 27450 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 4.938210487365723, | |
| "learning_rate": 0.0001665447213153275, | |
| "loss": 3.299, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 5.116371154785156, | |
| "learning_rate": 0.00016640775044704634, | |
| "loss": 3.3231, | |
| "step": 27550 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 6.796716213226318, | |
| "learning_rate": 0.0001662705563395682, | |
| "loss": 3.3685, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 9.196764945983887, | |
| "learning_rate": 0.0001661331394540908, | |
| "loss": 3.2807, | |
| "step": 27650 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 4.64096736907959, | |
| "learning_rate": 0.00016599550025256076, | |
| "loss": 3.2909, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 5.615271091461182, | |
| "learning_rate": 0.000165857639197672, | |
| "loss": 3.2044, | |
| "step": 27750 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 6.112679481506348, | |
| "learning_rate": 0.0001657195567528643, | |
| "loss": 3.2377, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 4.882411003112793, | |
| "learning_rate": 0.0001655812533823216, | |
| "loss": 3.4462, | |
| "step": 27850 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 4.716900825500488, | |
| "learning_rate": 0.00016544272955097063, | |
| "loss": 3.3563, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 5.415688991546631, | |
| "learning_rate": 0.0001653039857244791, | |
| "loss": 3.2475, | |
| "step": 27950 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 5.752101421356201, | |
| "learning_rate": 0.00016516502236925434, | |
| "loss": 3.3646, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 7.952321529388428, | |
| "learning_rate": 0.00016502583995244163, | |
| "loss": 3.2835, | |
| "step": 28050 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 5.233190536499023, | |
| "learning_rate": 0.00016488643894192268, | |
| "loss": 3.3653, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 5.358859062194824, | |
| "learning_rate": 0.00016474681980631402, | |
| "loss": 3.2425, | |
| "step": 28150 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 5.584342002868652, | |
| "learning_rate": 0.0001646069830149654, | |
| "loss": 3.3139, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 6.155908584594727, | |
| "learning_rate": 0.00016446692903795837, | |
| "loss": 3.2732, | |
| "step": 28250 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 6.454020023345947, | |
| "learning_rate": 0.00016432665834610445, | |
| "loss": 3.2121, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 5.337937831878662, | |
| "learning_rate": 0.00016418617141094374, | |
| "loss": 3.3123, | |
| "step": 28350 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 4.727194786071777, | |
| "learning_rate": 0.00016404546870474324, | |
| "loss": 3.2558, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 5.887204647064209, | |
| "learning_rate": 0.00016390455070049536, | |
| "loss": 3.377, | |
| "step": 28450 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 5.2247724533081055, | |
| "learning_rate": 0.0001637634178719162, | |
| "loss": 3.1277, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 5.014094829559326, | |
| "learning_rate": 0.00016362207069344403, | |
| "loss": 3.142, | |
| "step": 28550 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 7.58336067199707, | |
| "learning_rate": 0.00016348050964023773, | |
| "loss": 3.3156, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 4.806646823883057, | |
| "learning_rate": 0.00016333873518817514, | |
| "loss": 3.2814, | |
| "step": 28650 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 5.387387275695801, | |
| "learning_rate": 0.00016319674781385143, | |
| "loss": 3.3977, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 5.4912638664245605, | |
| "learning_rate": 0.00016305454799457755, | |
| "loss": 3.2323, | |
| "step": 28750 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 4.692640781402588, | |
| "learning_rate": 0.00016291213620837867, | |
| "loss": 3.3033, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 5.484092712402344, | |
| "learning_rate": 0.0001627695129339924, | |
| "loss": 3.1466, | |
| "step": 28850 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 5.305532932281494, | |
| "learning_rate": 0.00016262667865086746, | |
| "loss": 3.4111, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 5.511746883392334, | |
| "learning_rate": 0.00016248363383916182, | |
| "loss": 3.2535, | |
| "step": 28950 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 7.3723649978637695, | |
| "learning_rate": 0.00016234037897974108, | |
| "loss": 3.3265, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 5.085361003875732, | |
| "learning_rate": 0.0001621997858933184, | |
| "loss": 3.3003, | |
| "step": 29050 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 4.863938331604004, | |
| "learning_rate": 0.000162056116560834, | |
| "loss": 3.2411, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 5.351004123687744, | |
| "learning_rate": 0.00016191223861779529, | |
| "loss": 3.2409, | |
| "step": 29150 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 8.316920280456543, | |
| "learning_rate": 0.0001617681525478687, | |
| "loss": 3.0881, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 7.093586444854736, | |
| "learning_rate": 0.0001616238588354203, | |
| "loss": 3.2573, | |
| "step": 29250 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 6.5853705406188965, | |
| "learning_rate": 0.00016147935796551405, | |
| "loss": 3.3215, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 4.667483329772949, | |
| "learning_rate": 0.00016133465042391046, | |
| "loss": 3.3032, | |
| "step": 29350 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 5.287956237792969, | |
| "learning_rate": 0.00016118973669706468, | |
| "loss": 3.2255, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 5.676483631134033, | |
| "learning_rate": 0.0001610446172721251, | |
| "loss": 3.3663, | |
| "step": 29450 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 6.739033222198486, | |
| "learning_rate": 0.00016089929263693144, | |
| "loss": 3.2537, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 5.158905982971191, | |
| "learning_rate": 0.00016075376328001344, | |
| "loss": 3.2336, | |
| "step": 29550 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 5.2512526512146, | |
| "learning_rate": 0.00016060802969058885, | |
| "loss": 3.2982, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 8.520125389099121, | |
| "learning_rate": 0.00016046209235856212, | |
| "loss": 3.3153, | |
| "step": 29650 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 6.461225509643555, | |
| "learning_rate": 0.00016031595177452257, | |
| "loss": 3.2629, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 6.117869853973389, | |
| "learning_rate": 0.00016016960842974278, | |
| "loss": 3.2225, | |
| "step": 29750 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 4.035212993621826, | |
| "learning_rate": 0.00016002306281617692, | |
| "loss": 3.3866, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 5.271117687225342, | |
| "learning_rate": 0.00015987631542645913, | |
| "loss": 3.2602, | |
| "step": 29850 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 4.860154151916504, | |
| "learning_rate": 0.00015972936675390185, | |
| "loss": 3.288, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 5.154600143432617, | |
| "learning_rate": 0.0001595822172924942, | |
| "loss": 3.1941, | |
| "step": 29950 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 6.897374153137207, | |
| "learning_rate": 0.00015943486753690017, | |
| "loss": 3.2323, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 6.063130855560303, | |
| "learning_rate": 0.00015928731798245721, | |
| "loss": 3.1718, | |
| "step": 30050 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 5.736262321472168, | |
| "learning_rate": 0.00015913956912517432, | |
| "loss": 3.3035, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 5.386317253112793, | |
| "learning_rate": 0.00015899162146173053, | |
| "loss": 3.2879, | |
| "step": 30150 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 5.535543441772461, | |
| "learning_rate": 0.00015884347548947314, | |
| "loss": 3.2266, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 6.057496070861816, | |
| "learning_rate": 0.00015869513170641616, | |
| "loss": 3.1668, | |
| "step": 30250 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 5.6912055015563965, | |
| "learning_rate": 0.00015854659061123854, | |
| "loss": 3.1562, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 5.563050270080566, | |
| "learning_rate": 0.0001583978527032825, | |
| "loss": 3.1819, | |
| "step": 30350 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 6.75504732131958, | |
| "learning_rate": 0.0001582489184825519, | |
| "loss": 3.1891, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 5.390425205230713, | |
| "learning_rate": 0.00015809978844971053, | |
| "loss": 3.1856, | |
| "step": 30450 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 6.183398246765137, | |
| "learning_rate": 0.0001579504631060804, | |
| "loss": 3.3115, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 5.40380859375, | |
| "learning_rate": 0.00015780094295364015, | |
| "loss": 3.162, | |
| "step": 30550 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 5.078804016113281, | |
| "learning_rate": 0.00015765122849502325, | |
| "loss": 3.2046, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 6.183681964874268, | |
| "learning_rate": 0.00015750132023351638, | |
| "loss": 3.0689, | |
| "step": 30650 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 12.730826377868652, | |
| "learning_rate": 0.00015735121867305768, | |
| "loss": 3.2468, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 6.127053260803223, | |
| "learning_rate": 0.00015720092431823515, | |
| "loss": 3.1628, | |
| "step": 30750 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 5.8310089111328125, | |
| "learning_rate": 0.00015705043767428483, | |
| "loss": 3.2047, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 7.505776882171631, | |
| "learning_rate": 0.0001568997592470892, | |
| "loss": 3.2827, | |
| "step": 30850 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 5.399072170257568, | |
| "learning_rate": 0.00015674888954317549, | |
| "loss": 3.1483, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 5.224669456481934, | |
| "learning_rate": 0.00015659782906971383, | |
| "loss": 3.2698, | |
| "step": 30950 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 7.958742618560791, | |
| "learning_rate": 0.00015644657833451577, | |
| "loss": 3.0145, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 6.20373010635376, | |
| "learning_rate": 0.0001562981685120925, | |
| "loss": 3.1598, | |
| "step": 31050 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 5.2080159187316895, | |
| "learning_rate": 0.00015614654255930347, | |
| "loss": 3.2801, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 4.195250988006592, | |
| "learning_rate": 0.00015599472786184245, | |
| "loss": 3.156, | |
| "step": 31150 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 5.0389204025268555, | |
| "learning_rate": 0.00015584272493005642, | |
| "loss": 3.1345, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 7.003210544586182, | |
| "learning_rate": 0.00015569053427492505, | |
| "loss": 3.2186, | |
| "step": 31250 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 5.47674036026001, | |
| "learning_rate": 0.00015553815640805907, | |
| "loss": 3.3211, | |
| "step": 31300 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 5.2981648445129395, | |
| "learning_rate": 0.00015538559184169863, | |
| "loss": 3.2454, | |
| "step": 31350 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 5.528575420379639, | |
| "learning_rate": 0.00015523284108871142, | |
| "loss": 3.1963, | |
| "step": 31400 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 5.553009033203125, | |
| "learning_rate": 0.0001550799046625911, | |
| "loss": 3.1682, | |
| "step": 31450 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 5.608404636383057, | |
| "learning_rate": 0.0001549267830774553, | |
| "loss": 3.1461, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 6.462625503540039, | |
| "learning_rate": 0.00015477347684804445, | |
| "loss": 3.2772, | |
| "step": 31550 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 5.327962875366211, | |
| "learning_rate": 0.00015461998648971928, | |
| "loss": 3.2144, | |
| "step": 31600 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 6.94124174118042, | |
| "learning_rate": 0.00015446631251845978, | |
| "loss": 3.2227, | |
| "step": 31650 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 5.351782321929932, | |
| "learning_rate": 0.00015431245545086307, | |
| "loss": 3.2687, | |
| "step": 31700 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 4.562844276428223, | |
| "learning_rate": 0.00015415841580414185, | |
| "loss": 3.1332, | |
| "step": 31750 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 5.023700714111328, | |
| "learning_rate": 0.00015400419409612243, | |
| "loss": 3.2272, | |
| "step": 31800 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 5.127398490905762, | |
| "learning_rate": 0.0001538497908452433, | |
| "loss": 3.2843, | |
| "step": 31850 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 5.577905178070068, | |
| "learning_rate": 0.0001536952065705532, | |
| "loss": 3.2635, | |
| "step": 31900 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 6.119299411773682, | |
| "learning_rate": 0.00015354044179170933, | |
| "loss": 3.126, | |
| "step": 31950 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 4.827983856201172, | |
| "learning_rate": 0.0001533854970289758, | |
| "loss": 3.2345, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 4.499656677246094, | |
| "learning_rate": 0.00015323037280322166, | |
| "loss": 3.0808, | |
| "step": 32050 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 5.260239601135254, | |
| "learning_rate": 0.00015307506963591923, | |
| "loss": 3.1234, | |
| "step": 32100 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 5.486075401306152, | |
| "learning_rate": 0.00015291958804914256, | |
| "loss": 3.1769, | |
| "step": 32150 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 4.572110176086426, | |
| "learning_rate": 0.00015276392856556527, | |
| "loss": 3.2166, | |
| "step": 32200 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 5.288125991821289, | |
| "learning_rate": 0.0001526080917084591, | |
| "loss": 3.0781, | |
| "step": 32250 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 6.380829334259033, | |
| "learning_rate": 0.000152452078001692, | |
| "loss": 3.1178, | |
| "step": 32300 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 5.039462566375732, | |
| "learning_rate": 0.00015229588796972652, | |
| "loss": 3.2808, | |
| "step": 32350 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 7.323626518249512, | |
| "learning_rate": 0.00015213952213761787, | |
| "loss": 3.1391, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 5.912395000457764, | |
| "learning_rate": 0.00015198298103101228, | |
| "loss": 3.1744, | |
| "step": 32450 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 5.694441795349121, | |
| "learning_rate": 0.00015182626517614518, | |
| "loss": 3.0576, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 6.467188358306885, | |
| "learning_rate": 0.00015166937509983943, | |
| "loss": 3.2361, | |
| "step": 32550 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 5.292226791381836, | |
| "learning_rate": 0.00015151231132950357, | |
| "loss": 3.1376, | |
| "step": 32600 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 6.369929313659668, | |
| "learning_rate": 0.00015135507439313005, | |
| "loss": 3.1406, | |
| "step": 32650 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 5.252573013305664, | |
| "learning_rate": 0.00015119766481929342, | |
| "loss": 3.123, | |
| "step": 32700 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 6.9053730964660645, | |
| "learning_rate": 0.00015104008313714858, | |
| "loss": 3.0018, | |
| "step": 32750 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 6.080839157104492, | |
| "learning_rate": 0.00015088232987642898, | |
| "loss": 3.2106, | |
| "step": 32800 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 5.92653226852417, | |
| "learning_rate": 0.00015072440556744492, | |
| "loss": 3.2095, | |
| "step": 32850 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 5.627429008483887, | |
| "learning_rate": 0.00015056631074108166, | |
| "loss": 3.089, | |
| "step": 32900 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 5.694194793701172, | |
| "learning_rate": 0.00015040804592879762, | |
| "loss": 3.0885, | |
| "step": 32950 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 5.04107666015625, | |
| "learning_rate": 0.00015024961166262276, | |
| "loss": 3.0906, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 5.280002117156982, | |
| "learning_rate": 0.0001500910084751567, | |
| "loss": 3.2142, | |
| "step": 33050 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 4.803068161010742, | |
| "learning_rate": 0.00014993223689956672, | |
| "loss": 3.2014, | |
| "step": 33100 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 5.611780643463135, | |
| "learning_rate": 0.00014977329746958636, | |
| "loss": 3.2491, | |
| "step": 33150 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 5.6669020652771, | |
| "learning_rate": 0.00014961737449079314, | |
| "loss": 3.2, | |
| "step": 33200 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 5.894138336181641, | |
| "learning_rate": 0.00014945810428594703, | |
| "loss": 3.0321, | |
| "step": 33250 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 6.354518413543701, | |
| "learning_rate": 0.0001492986678205755, | |
| "loss": 3.1314, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 4.786489009857178, | |
| "learning_rate": 0.00014913906563064706, | |
| "loss": 3.1937, | |
| "step": 33350 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 8.759417533874512, | |
| "learning_rate": 0.00014897929825268745, | |
| "loss": 3.1069, | |
| "step": 33400 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 5.354910850524902, | |
| "learning_rate": 0.00014881936622377766, | |
| "loss": 3.1519, | |
| "step": 33450 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 5.169478416442871, | |
| "learning_rate": 0.0001486592700815522, | |
| "loss": 3.1414, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 6.09418249130249, | |
| "learning_rate": 0.00014849901036419723, | |
| "loss": 3.0954, | |
| "step": 33550 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 4.986037731170654, | |
| "learning_rate": 0.00014833858761044883, | |
| "loss": 3.2445, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 12.803654670715332, | |
| "learning_rate": 0.00014817800235959118, | |
| "loss": 3.0699, | |
| "step": 33650 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 6.191990852355957, | |
| "learning_rate": 0.00014801725515145467, | |
| "loss": 3.2574, | |
| "step": 33700 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 5.233778476715088, | |
| "learning_rate": 0.00014785634652641412, | |
| "loss": 3.1152, | |
| "step": 33750 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 6.065474987030029, | |
| "learning_rate": 0.000147695277025387, | |
| "loss": 3.1178, | |
| "step": 33800 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 5.4664435386657715, | |
| "learning_rate": 0.00014753404718983158, | |
| "loss": 3.0627, | |
| "step": 33850 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 4.926270484924316, | |
| "learning_rate": 0.00014737265756174515, | |
| "loss": 3.0182, | |
| "step": 33900 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 6.013931751251221, | |
| "learning_rate": 0.0001472111086836621, | |
| "loss": 3.0801, | |
| "step": 33950 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 6.25607442855835, | |
| "learning_rate": 0.00014704940109865224, | |
| "loss": 3.1227, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 5.084470272064209, | |
| "learning_rate": 0.00014688753535031882, | |
| "loss": 3.1786, | |
| "step": 34050 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 5.16444730758667, | |
| "learning_rate": 0.00014672551198279687, | |
| "loss": 3.0651, | |
| "step": 34100 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 5.316379070281982, | |
| "learning_rate": 0.00014656333154075118, | |
| "loss": 2.9925, | |
| "step": 34150 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 5.331335067749023, | |
| "learning_rate": 0.00014640099456937462, | |
| "loss": 3.097, | |
| "step": 34200 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 4.452579498291016, | |
| "learning_rate": 0.00014623850161438626, | |
| "loss": 3.1222, | |
| "step": 34250 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 5.550851345062256, | |
| "learning_rate": 0.00014607585322202953, | |
| "loss": 3.2343, | |
| "step": 34300 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 6.5701375007629395, | |
| "learning_rate": 0.00014591304993907033, | |
| "loss": 3.1558, | |
| "step": 34350 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 5.140341758728027, | |
| "learning_rate": 0.00014575009231279534, | |
| "loss": 3.0036, | |
| "step": 34400 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 4.767757415771484, | |
| "learning_rate": 0.00014558698089101003, | |
| "loss": 3.1355, | |
| "step": 34450 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 6.189707279205322, | |
| "learning_rate": 0.00014542371622203689, | |
| "loss": 3.0721, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 4.492117881774902, | |
| "learning_rate": 0.00014526029885471355, | |
| "loss": 3.1083, | |
| "step": 34550 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 5.021632671356201, | |
| "learning_rate": 0.000145096729338391, | |
| "loss": 3.1291, | |
| "step": 34600 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 5.501243591308594, | |
| "learning_rate": 0.00014493300822293164, | |
| "loss": 3.0654, | |
| "step": 34650 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 5.056375026702881, | |
| "learning_rate": 0.0001447691360587076, | |
| "loss": 3.054, | |
| "step": 34700 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 5.866014003753662, | |
| "learning_rate": 0.0001446051133965986, | |
| "loss": 3.1111, | |
| "step": 34750 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 5.377662658691406, | |
| "learning_rate": 0.0001444409407879905, | |
| "loss": 3.2083, | |
| "step": 34800 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 5.692511558532715, | |
| "learning_rate": 0.00014427661878477305, | |
| "loss": 3.1613, | |
| "step": 34850 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 5.763862609863281, | |
| "learning_rate": 0.0001441121479393383, | |
| "loss": 3.1422, | |
| "step": 34900 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 6.247575283050537, | |
| "learning_rate": 0.00014394752880457867, | |
| "loss": 3.1826, | |
| "step": 34950 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 6.18945837020874, | |
| "learning_rate": 0.00014378276193388498, | |
| "loss": 3.0887, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 6.006703853607178, | |
| "learning_rate": 0.00014361784788114476, | |
| "loss": 3.0888, | |
| "step": 35050 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 5.308047771453857, | |
| "learning_rate": 0.00014345278720074032, | |
| "loss": 3.1604, | |
| "step": 35100 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 4.780633926391602, | |
| "learning_rate": 0.00014329088601053276, | |
| "loss": 3.0979, | |
| "step": 35150 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 5.292586803436279, | |
| "learning_rate": 0.0001431255366448192, | |
| "loss": 3.1392, | |
| "step": 35200 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 15.803680419921875, | |
| "learning_rate": 0.0001429600423064164, | |
| "loss": 3.1114, | |
| "step": 35250 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 6.428526401519775, | |
| "learning_rate": 0.00014279440355165733, | |
| "loss": 2.9971, | |
| "step": 35300 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 6.179375171661377, | |
| "learning_rate": 0.0001426286209373605, | |
| "loss": 3.0888, | |
| "step": 35350 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 5.588586330413818, | |
| "learning_rate": 0.000142462695020828, | |
| "loss": 3.0704, | |
| "step": 35400 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 4.596306324005127, | |
| "learning_rate": 0.00014229662635984358, | |
| "loss": 3.1505, | |
| "step": 35450 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 5.788643836975098, | |
| "learning_rate": 0.00014213041551267098, | |
| "loss": 2.9664, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 6.529501438140869, | |
| "learning_rate": 0.00014196406303805181, | |
| "loss": 3.0943, | |
| "step": 35550 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 6.052688121795654, | |
| "learning_rate": 0.00014179756949520385, | |
| "loss": 3.1545, | |
| "step": 35600 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 4.716464042663574, | |
| "learning_rate": 0.00014163093544381904, | |
| "loss": 3.1625, | |
| "step": 35650 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 5.132946014404297, | |
| "learning_rate": 0.0001414641614440617, | |
| "loss": 3.0922, | |
| "step": 35700 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 5.718029975891113, | |
| "learning_rate": 0.00014129724805656666, | |
| "loss": 3.0912, | |
| "step": 35750 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 5.161171913146973, | |
| "learning_rate": 0.00014113019584243716, | |
| "loss": 3.1888, | |
| "step": 35800 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 5.421058177947998, | |
| "learning_rate": 0.0001409630053632433, | |
| "loss": 2.9738, | |
| "step": 35850 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 5.962222576141357, | |
| "learning_rate": 0.00014079567718101987, | |
| "loss": 3.0353, | |
| "step": 35900 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 5.4867753982543945, | |
| "learning_rate": 0.0001406282118582646, | |
| "loss": 3.0904, | |
| "step": 35950 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 7.088072299957275, | |
| "learning_rate": 0.00014046060995793626, | |
| "loss": 3.0246, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 5.093292236328125, | |
| "learning_rate": 0.0001402928720434527, | |
| "loss": 2.9939, | |
| "step": 36050 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 5.068340301513672, | |
| "learning_rate": 0.00014012499867868905, | |
| "loss": 3.118, | |
| "step": 36100 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 5.125149726867676, | |
| "learning_rate": 0.00013995699042797574, | |
| "loss": 3.0984, | |
| "step": 36150 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 5.709127902984619, | |
| "learning_rate": 0.00013978884785609668, | |
| "loss": 3.0354, | |
| "step": 36200 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 4.788537979125977, | |
| "learning_rate": 0.00013962057152828726, | |
| "loss": 3.0648, | |
| "step": 36250 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 6.115299701690674, | |
| "learning_rate": 0.00013945216201023252, | |
| "loss": 3.191, | |
| "step": 36300 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 8.774588584899902, | |
| "learning_rate": 0.0001392836198680653, | |
| "loss": 3.2434, | |
| "step": 36350 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 5.179059982299805, | |
| "learning_rate": 0.00013911494566836417, | |
| "loss": 3.0184, | |
| "step": 36400 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 5.080101490020752, | |
| "learning_rate": 0.00013894613997815174, | |
| "loss": 3.0847, | |
| "step": 36450 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 5.972216606140137, | |
| "learning_rate": 0.00013877720336489264, | |
| "loss": 3.0745, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 4.853601455688477, | |
| "learning_rate": 0.0001386081363964915, | |
| "loss": 3.0191, | |
| "step": 36550 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 9.077406883239746, | |
| "learning_rate": 0.00013843893964129127, | |
| "loss": 3.0575, | |
| "step": 36600 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 5.7632904052734375, | |
| "learning_rate": 0.00013826961366807114, | |
| "loss": 3.1308, | |
| "step": 36650 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 7.178746700286865, | |
| "learning_rate": 0.00013810015904604472, | |
| "loss": 3.0838, | |
| "step": 36700 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 5.9257402420043945, | |
| "learning_rate": 0.0001379305763448581, | |
| "loss": 3.1233, | |
| "step": 36750 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 5.543473720550537, | |
| "learning_rate": 0.00013776086613458783, | |
| "loss": 3.0783, | |
| "step": 36800 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 6.271590709686279, | |
| "learning_rate": 0.00013759102898573924, | |
| "loss": 2.9985, | |
| "step": 36850 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 6.419349193572998, | |
| "learning_rate": 0.00013742106546924427, | |
| "loss": 3.093, | |
| "step": 36900 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 6.282319068908691, | |
| "learning_rate": 0.00013725097615645973, | |
| "loss": 3.0827, | |
| "step": 36950 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 5.733994960784912, | |
| "learning_rate": 0.00013708076161916529, | |
| "loss": 2.9967, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 5.090719699859619, | |
| "learning_rate": 0.00013691042242956156, | |
| "loss": 3.1321, | |
| "step": 37050 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 4.850597381591797, | |
| "learning_rate": 0.00013673995916026823, | |
| "loss": 3.0506, | |
| "step": 37100 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 5.414620876312256, | |
| "learning_rate": 0.00013656937238432207, | |
| "loss": 2.9907, | |
| "step": 37150 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 5.465592384338379, | |
| "learning_rate": 0.00013639866267517502, | |
| "loss": 3.0623, | |
| "step": 37200 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 5.292060375213623, | |
| "learning_rate": 0.00013622783060669236, | |
| "loss": 3.1343, | |
| "step": 37250 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 5.4502644538879395, | |
| "learning_rate": 0.00013605687675315056, | |
| "loss": 3.0725, | |
| "step": 37300 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 4.803048133850098, | |
| "learning_rate": 0.00013588580168923564, | |
| "loss": 3.0884, | |
| "step": 37350 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 5.469101428985596, | |
| "learning_rate": 0.00013571460599004097, | |
| "loss": 3.0724, | |
| "step": 37400 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 4.901301383972168, | |
| "learning_rate": 0.00013554329023106549, | |
| "loss": 3.1341, | |
| "step": 37450 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 6.607944011688232, | |
| "learning_rate": 0.00013537185498821178, | |
| "loss": 3.003, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 5.942598342895508, | |
| "learning_rate": 0.00013520030083778403, | |
| "loss": 3.1121, | |
| "step": 37550 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 4.80432653427124, | |
| "learning_rate": 0.00013502862835648618, | |
| "loss": 3.0313, | |
| "step": 37600 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 5.739936828613281, | |
| "learning_rate": 0.00013485683812141993, | |
| "loss": 3.1094, | |
| "step": 37650 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 5.018519401550293, | |
| "learning_rate": 0.00013468493071008286, | |
| "loss": 3.0696, | |
| "step": 37700 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 6.4165825843811035, | |
| "learning_rate": 0.00013451290670036641, | |
| "loss": 3.1152, | |
| "step": 37750 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 4.517113208770752, | |
| "learning_rate": 0.00013434076667055407, | |
| "loss": 2.886, | |
| "step": 37800 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 7.886168956756592, | |
| "learning_rate": 0.00013416851119931922, | |
| "loss": 3.1064, | |
| "step": 37850 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 6.000852108001709, | |
| "learning_rate": 0.0001339961408657235, | |
| "loss": 2.9484, | |
| "step": 37900 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 5.562610626220703, | |
| "learning_rate": 0.00013382365624921438, | |
| "loss": 3.0847, | |
| "step": 37950 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 5.29053258895874, | |
| "learning_rate": 0.00013365105792962384, | |
| "loss": 3.0938, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 6.465425491333008, | |
| "learning_rate": 0.0001334783464871658, | |
| "loss": 3.1012, | |
| "step": 38050 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 5.68001651763916, | |
| "learning_rate": 0.00013330552250243475, | |
| "loss": 3.0371, | |
| "step": 38100 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 6.374165058135986, | |
| "learning_rate": 0.00013313258655640325, | |
| "loss": 3.141, | |
| "step": 38150 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 6.1900410652160645, | |
| "learning_rate": 0.00013295953923042034, | |
| "loss": 2.9999, | |
| "step": 38200 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 5.627082824707031, | |
| "learning_rate": 0.00013278638110620952, | |
| "loss": 2.9751, | |
| "step": 38250 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 6.941247463226318, | |
| "learning_rate": 0.0001326131127658667, | |
| "loss": 3.0579, | |
| "step": 38300 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 5.677605152130127, | |
| "learning_rate": 0.00013243973479185828, | |
| "loss": 3.0327, | |
| "step": 38350 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 5.137176513671875, | |
| "learning_rate": 0.00013226624776701925, | |
| "loss": 3.0782, | |
| "step": 38400 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 7.087386131286621, | |
| "learning_rate": 0.00013209265227455118, | |
| "loss": 3.0633, | |
| "step": 38450 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 4.575850486755371, | |
| "learning_rate": 0.0001319189488980203, | |
| "loss": 3.1038, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 4.31092643737793, | |
| "learning_rate": 0.0001317451382213554, | |
| "loss": 3.0153, | |
| "step": 38550 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 5.346449375152588, | |
| "learning_rate": 0.0001315712208288461, | |
| "loss": 2.998, | |
| "step": 38600 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 5.6449198722839355, | |
| "learning_rate": 0.0001313971973051407, | |
| "loss": 3.0801, | |
| "step": 38650 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 5.223085403442383, | |
| "learning_rate": 0.00013122306823524425, | |
| "loss": 2.9869, | |
| "step": 38700 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 5.5236053466796875, | |
| "learning_rate": 0.00013104883420451673, | |
| "loss": 2.865, | |
| "step": 38750 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 6.014589786529541, | |
| "learning_rate": 0.00013087449579867082, | |
| "loss": 2.9073, | |
| "step": 38800 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 4.373612403869629, | |
| "learning_rate": 0.0001307000536037701, | |
| "loss": 2.982, | |
| "step": 38850 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 4.678324222564697, | |
| "learning_rate": 0.00013052550820622712, | |
| "loss": 3.0649, | |
| "step": 38900 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 5.631078720092773, | |
| "learning_rate": 0.0001303508601928013, | |
| "loss": 3.0235, | |
| "step": 38950 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 5.745089054107666, | |
| "learning_rate": 0.00013017611015059703, | |
| "loss": 3.017, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 4.9316086769104, | |
| "learning_rate": 0.00013000125866706173, | |
| "loss": 2.8922, | |
| "step": 39050 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 4.83328104019165, | |
| "learning_rate": 0.00012982630632998375, | |
| "loss": 2.9944, | |
| "step": 39100 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 4.858354568481445, | |
| "learning_rate": 0.00012965125372749048, | |
| "loss": 3.2066, | |
| "step": 39150 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 4.85454797744751, | |
| "learning_rate": 0.00012947610144804653, | |
| "loss": 3.0025, | |
| "step": 39200 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 4.420687675476074, | |
| "learning_rate": 0.00012930085008045134, | |
| "loss": 3.0974, | |
| "step": 39250 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 5.3282012939453125, | |
| "learning_rate": 0.0001291255002138376, | |
| "loss": 2.9854, | |
| "step": 39300 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 5.671788692474365, | |
| "learning_rate": 0.0001289500524376691, | |
| "loss": 2.9885, | |
| "step": 39350 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 4.758500099182129, | |
| "learning_rate": 0.00012877450734173875, | |
| "loss": 3.0249, | |
| "step": 39400 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 5.179042339324951, | |
| "learning_rate": 0.00012859886551616664, | |
| "loss": 2.9665, | |
| "step": 39450 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 4.027754306793213, | |
| "learning_rate": 0.00012842312755139795, | |
| "loss": 2.9796, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 4.407317638397217, | |
| "learning_rate": 0.00012824729403820118, | |
| "loss": 3.0149, | |
| "step": 39550 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 4.677199840545654, | |
| "learning_rate": 0.00012807136556766587, | |
| "loss": 3.0432, | |
| "step": 39600 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 4.903664588928223, | |
| "learning_rate": 0.00012789534273120094, | |
| "loss": 2.9611, | |
| "step": 39650 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 4.799653053283691, | |
| "learning_rate": 0.0001277192261205324, | |
| "loss": 3.0132, | |
| "step": 39700 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 5.403965473175049, | |
| "learning_rate": 0.00012754301632770157, | |
| "loss": 2.9755, | |
| "step": 39750 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 15.81489086151123, | |
| "learning_rate": 0.000127366713945063, | |
| "loss": 2.9749, | |
| "step": 39800 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 5.791144847869873, | |
| "learning_rate": 0.00012719031956528247, | |
| "loss": 2.9958, | |
| "step": 39850 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 6.054769992828369, | |
| "learning_rate": 0.000127013833781335, | |
| "loss": 3.0009, | |
| "step": 39900 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 4.867542743682861, | |
| "learning_rate": 0.00012683725718650298, | |
| "loss": 3.0065, | |
| "step": 39950 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 6.006402015686035, | |
| "learning_rate": 0.00012666059037437402, | |
| "loss": 2.9425, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 4.8633551597595215, | |
| "learning_rate": 0.00012648383393883893, | |
| "loss": 2.9857, | |
| "step": 40050 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 4.701941013336182, | |
| "learning_rate": 0.0001263069884740899, | |
| "loss": 2.9212, | |
| "step": 40100 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 4.5726237297058105, | |
| "learning_rate": 0.0001261300545746184, | |
| "loss": 2.9625, | |
| "step": 40150 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 7.131462574005127, | |
| "learning_rate": 0.00012595303283521316, | |
| "loss": 3.0548, | |
| "step": 40200 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 5.9713826179504395, | |
| "learning_rate": 0.00012577592385095814, | |
| "loss": 2.9929, | |
| "step": 40250 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 4.615350723266602, | |
| "learning_rate": 0.0001255987282172307, | |
| "loss": 3.0768, | |
| "step": 40300 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 5.178917407989502, | |
| "learning_rate": 0.00012542144652969944, | |
| "loss": 3.0278, | |
| "step": 40350 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 6.033947944641113, | |
| "learning_rate": 0.0001252440793843222, | |
| "loss": 2.984, | |
| "step": 40400 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 4.493157863616943, | |
| "learning_rate": 0.00012506662737734413, | |
| "loss": 3.0221, | |
| "step": 40450 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 5.638075351715088, | |
| "learning_rate": 0.00012488909110529563, | |
| "loss": 3.0525, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 6.92527437210083, | |
| "learning_rate": 0.00012471147116499044, | |
| "loss": 2.9417, | |
| "step": 40550 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 5.838274955749512, | |
| "learning_rate": 0.0001245337681535235, | |
| "loss": 2.9705, | |
| "step": 40600 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 5.925703525543213, | |
| "learning_rate": 0.00012435598266826906, | |
| "loss": 2.9178, | |
| "step": 40650 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 5.92296028137207, | |
| "learning_rate": 0.0001241781153068785, | |
| "loss": 3.0117, | |
| "step": 40700 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 5.870546817779541, | |
| "learning_rate": 0.00012400016666727856, | |
| "loss": 2.9036, | |
| "step": 40750 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 4.947800159454346, | |
| "learning_rate": 0.00012382213734766918, | |
| "loss": 2.9763, | |
| "step": 40800 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 5.460611820220947, | |
| "learning_rate": 0.00012364402794652146, | |
| "loss": 2.9834, | |
| "step": 40850 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 4.415246486663818, | |
| "learning_rate": 0.00012346583906257578, | |
| "loss": 2.9728, | |
| "step": 40900 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 4.871602535247803, | |
| "learning_rate": 0.00012328757129483967, | |
| "loss": 3.1404, | |
| "step": 40950 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 5.851921558380127, | |
| "learning_rate": 0.00012310922524258588, | |
| "loss": 2.9854, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 5.699054718017578, | |
| "learning_rate": 0.00012293080150535025, | |
| "loss": 3.0183, | |
| "step": 41050 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 5.255423069000244, | |
| "learning_rate": 0.00012275230068292987, | |
| "loss": 3.0225, | |
| "step": 41100 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 6.410534858703613, | |
| "learning_rate": 0.00012257372337538087, | |
| "loss": 2.9212, | |
| "step": 41150 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 5.757706642150879, | |
| "learning_rate": 0.0001223950701830165, | |
| "loss": 2.9113, | |
| "step": 41200 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 5.773000717163086, | |
| "learning_rate": 0.00012221634170640525, | |
| "loss": 3.0744, | |
| "step": 41250 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 5.893006801605225, | |
| "learning_rate": 0.00012203753854636848, | |
| "loss": 2.9178, | |
| "step": 41300 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 4.773557186126709, | |
| "learning_rate": 0.00012185866130397874, | |
| "loss": 3.0172, | |
| "step": 41350 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 5.521202087402344, | |
| "learning_rate": 0.00012167971058055758, | |
| "loss": 3.0125, | |
| "step": 41400 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 4.666463375091553, | |
| "learning_rate": 0.00012150068697767361, | |
| "loss": 3.01, | |
| "step": 41450 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 6.500729084014893, | |
| "learning_rate": 0.00012132159109714036, | |
| "loss": 2.9112, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 4.394867897033691, | |
| "learning_rate": 0.0001211424235410144, | |
| "loss": 2.9984, | |
| "step": 41550 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 5.369923114776611, | |
| "learning_rate": 0.00012096318491159317, | |
| "loss": 3.0215, | |
| "step": 41600 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 4.682898998260498, | |
| "learning_rate": 0.00012078387581141315, | |
| "loss": 3.0607, | |
| "step": 41650 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 6.522764205932617, | |
| "learning_rate": 0.00012060449684324761, | |
| "loss": 2.8554, | |
| "step": 41700 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 4.820666790008545, | |
| "learning_rate": 0.00012042504861010472, | |
| "loss": 2.9452, | |
| "step": 41750 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 4.502612113952637, | |
| "learning_rate": 0.00012024553171522557, | |
| "loss": 2.8537, | |
| "step": 41800 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 6.1411919593811035, | |
| "learning_rate": 0.00012006594676208191, | |
| "loss": 2.8307, | |
| "step": 41850 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 6.050970554351807, | |
| "learning_rate": 0.00011988629435437444, | |
| "loss": 3.0305, | |
| "step": 41900 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 5.647627830505371, | |
| "learning_rate": 0.0001197065750960305, | |
| "loss": 2.8381, | |
| "step": 41950 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 6.286309719085693, | |
| "learning_rate": 0.00011952678959120224, | |
| "loss": 2.9592, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 5.323746204376221, | |
| "learning_rate": 0.00011935053610658573, | |
| "loss": 2.9537, | |
| "step": 42050 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 5.923861503601074, | |
| "learning_rate": 0.00011917062121695774, | |
| "loss": 2.9324, | |
| "step": 42100 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 5.180017948150635, | |
| "learning_rate": 0.00011899064188253139, | |
| "loss": 3.0805, | |
| "step": 42150 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 5.270453929901123, | |
| "learning_rate": 0.00011881059870833311, | |
| "loss": 2.9936, | |
| "step": 42200 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 4.614379405975342, | |
| "learning_rate": 0.0001186304922996039, | |
| "loss": 3.0418, | |
| "step": 42250 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 5.335697174072266, | |
| "learning_rate": 0.00011845032326179733, | |
| "loss": 2.8929, | |
| "step": 42300 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 4.8122968673706055, | |
| "learning_rate": 0.00011827009220057747, | |
| "loss": 2.9899, | |
| "step": 42350 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 5.42422342300415, | |
| "learning_rate": 0.00011808979972181702, | |
| "loss": 2.9105, | |
| "step": 42400 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 5.390098571777344, | |
| "learning_rate": 0.00011790944643159498, | |
| "loss": 2.9243, | |
| "step": 42450 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 4.957045078277588, | |
| "learning_rate": 0.0001177290329361949, | |
| "loss": 2.8823, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 4.895153522491455, | |
| "learning_rate": 0.00011754855984210267, | |
| "loss": 2.9154, | |
| "step": 42550 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 5.1375651359558105, | |
| "learning_rate": 0.00011736802775600453, | |
| "loss": 2.8391, | |
| "step": 42600 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 5.678954601287842, | |
| "learning_rate": 0.000117187437284785, | |
| "loss": 3.0804, | |
| "step": 42650 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 5.407025337219238, | |
| "learning_rate": 0.00011700678903552496, | |
| "loss": 2.8453, | |
| "step": 42700 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 4.979137420654297, | |
| "learning_rate": 0.00011682608361549941, | |
| "loss": 2.9804, | |
| "step": 42750 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 6.545903205871582, | |
| "learning_rate": 0.00011664532163217561, | |
| "loss": 2.9162, | |
| "step": 42800 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 4.6958513259887695, | |
| "learning_rate": 0.00011646450369321096, | |
| "loss": 2.9211, | |
| "step": 42850 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 5.4765777587890625, | |
| "learning_rate": 0.00011628363040645092, | |
| "loss": 3.0789, | |
| "step": 42900 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 4.468578338623047, | |
| "learning_rate": 0.00011610270237992707, | |
| "loss": 2.8729, | |
| "step": 42950 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 5.818888187408447, | |
| "learning_rate": 0.00011592172022185495, | |
| "loss": 3.0, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 4.613924503326416, | |
| "learning_rate": 0.00011574068454063209, | |
| "loss": 2.9263, | |
| "step": 43050 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 5.543806076049805, | |
| "learning_rate": 0.00011555959594483597, | |
| "loss": 3.0417, | |
| "step": 43100 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 6.07559871673584, | |
| "learning_rate": 0.00011537845504322191, | |
| "loss": 2.8919, | |
| "step": 43150 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 5.199263572692871, | |
| "learning_rate": 0.00011519726244472111, | |
| "loss": 2.9026, | |
| "step": 43200 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 5.044497966766357, | |
| "learning_rate": 0.0001150160187584385, | |
| "loss": 2.9563, | |
| "step": 43250 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 4.860003471374512, | |
| "learning_rate": 0.00011483472459365079, | |
| "loss": 3.041, | |
| "step": 43300 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 5.622586727142334, | |
| "learning_rate": 0.00011465338055980439, | |
| "loss": 2.7987, | |
| "step": 43350 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 6.760509490966797, | |
| "learning_rate": 0.00011447198726651332, | |
| "loss": 2.9494, | |
| "step": 43400 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 8.510196685791016, | |
| "learning_rate": 0.00011429054532355719, | |
| "loss": 2.9445, | |
| "step": 43450 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 5.452268123626709, | |
| "learning_rate": 0.00011410905534087917, | |
| "loss": 2.8942, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 6.183492183685303, | |
| "learning_rate": 0.00011392751792858391, | |
| "loss": 3.044, | |
| "step": 43550 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 6.01775598526001, | |
| "learning_rate": 0.00011374593369693555, | |
| "loss": 2.9628, | |
| "step": 43600 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 4.965917110443115, | |
| "learning_rate": 0.00011356430325635556, | |
| "loss": 2.8986, | |
| "step": 43650 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 4.815042018890381, | |
| "learning_rate": 0.00011338262721742076, | |
| "loss": 2.8902, | |
| "step": 43700 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 6.838842391967773, | |
| "learning_rate": 0.0001132009061908613, | |
| "loss": 2.8702, | |
| "step": 43750 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 4.264350891113281, | |
| "learning_rate": 0.00011301914078755846, | |
| "loss": 2.9841, | |
| "step": 43800 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 5.292375564575195, | |
| "learning_rate": 0.00011283733161854284, | |
| "loss": 2.9599, | |
| "step": 43850 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 6.021354675292969, | |
| "learning_rate": 0.00011265547929499205, | |
| "loss": 2.9701, | |
| "step": 43900 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 5.289261817932129, | |
| "learning_rate": 0.0001124735844282288, | |
| "loss": 2.9529, | |
| "step": 43950 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 5.365589141845703, | |
| "learning_rate": 0.00011229164762971884, | |
| "loss": 2.8719, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 5.0670599937438965, | |
| "learning_rate": 0.00011210966951106889, | |
| "loss": 2.901, | |
| "step": 44050 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 5.8635573387146, | |
| "learning_rate": 0.00011193129145555024, | |
| "loss": 2.8916, | |
| "step": 44100 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 5.172703266143799, | |
| "learning_rate": 0.00011174923332792682, | |
| "loss": 2.9745, | |
| "step": 44150 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 5.451959609985352, | |
| "learning_rate": 0.000111567135703567, | |
| "loss": 2.9827, | |
| "step": 44200 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 4.874610900878906, | |
| "learning_rate": 0.00011138499919461816, | |
| "loss": 2.9749, | |
| "step": 44250 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 4.621652603149414, | |
| "learning_rate": 0.0001112028244133583, | |
| "loss": 2.8464, | |
| "step": 44300 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 5.568783760070801, | |
| "learning_rate": 0.00011102061197219413, | |
| "loss": 2.923, | |
| "step": 44350 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 6.537819862365723, | |
| "learning_rate": 0.00011083836248365901, | |
| "loss": 2.9042, | |
| "step": 44400 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 4.626901626586914, | |
| "learning_rate": 0.00011065607656041075, | |
| "loss": 2.9843, | |
| "step": 44450 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 5.579078197479248, | |
| "learning_rate": 0.00011047375481522969, | |
| "loss": 2.9371, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 5.379307746887207, | |
| "learning_rate": 0.00011029139786101657, | |
| "loss": 2.8835, | |
| "step": 44550 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 5.236015796661377, | |
| "learning_rate": 0.00011010900631079049, | |
| "loss": 2.8995, | |
| "step": 44600 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 4.900421619415283, | |
| "learning_rate": 0.00010992658077768689, | |
| "loss": 2.9544, | |
| "step": 44650 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 4.1276445388793945, | |
| "learning_rate": 0.0001097441218749554, | |
| "loss": 2.9357, | |
| "step": 44700 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 5.315830707550049, | |
| "learning_rate": 0.00010956163021595782, | |
| "loss": 2.9812, | |
| "step": 44750 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 4.422897815704346, | |
| "learning_rate": 0.00010937910641416613, | |
| "loss": 2.7807, | |
| "step": 44800 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 6.341892242431641, | |
| "learning_rate": 0.0001091965510831603, | |
| "loss": 3.0267, | |
| "step": 44850 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 4.295689105987549, | |
| "learning_rate": 0.00010901396483662629, | |
| "loss": 2.9204, | |
| "step": 44900 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 5.320792198181152, | |
| "learning_rate": 0.00010883134828835406, | |
| "loss": 2.8956, | |
| "step": 44950 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 5.325866222381592, | |
| "learning_rate": 0.00010864870205223534, | |
| "loss": 2.8959, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 7.466726303100586, | |
| "learning_rate": 0.00010846602674226174, | |
| "loss": 3.0028, | |
| "step": 45050 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 5.151381015777588, | |
| "learning_rate": 0.0001082833229725225, | |
| "loss": 2.9644, | |
| "step": 45100 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 4.178279399871826, | |
| "learning_rate": 0.00010810059135720268, | |
| "loss": 3.0683, | |
| "step": 45150 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 6.167341709136963, | |
| "learning_rate": 0.00010791783251058083, | |
| "loss": 2.8418, | |
| "step": 45200 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 4.541141033172607, | |
| "learning_rate": 0.00010773504704702707, | |
| "loss": 2.9832, | |
| "step": 45250 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 5.082434177398682, | |
| "learning_rate": 0.00010755223558100099, | |
| "loss": 2.9448, | |
| "step": 45300 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 4.632344722747803, | |
| "learning_rate": 0.00010736939872704965, | |
| "loss": 2.8956, | |
| "step": 45350 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 4.984410762786865, | |
| "learning_rate": 0.00010718653709980537, | |
| "loss": 2.7704, | |
| "step": 45400 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 4.388175964355469, | |
| "learning_rate": 0.00010700365131398384, | |
| "loss": 2.9454, | |
| "step": 45450 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 5.920036792755127, | |
| "learning_rate": 0.00010682074198438188, | |
| "loss": 2.7829, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 7.379148006439209, | |
| "learning_rate": 0.00010663780972587546, | |
| "loss": 2.8758, | |
| "step": 45550 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 4.970609664916992, | |
| "learning_rate": 0.00010645485515341773, | |
| "loss": 2.8542, | |
| "step": 45600 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 5.384859561920166, | |
| "learning_rate": 0.00010627187888203671, | |
| "loss": 3.0344, | |
| "step": 45650 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 6.491629123687744, | |
| "learning_rate": 0.00010608888152683345, | |
| "loss": 2.929, | |
| "step": 45700 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 4.58522367477417, | |
| "learning_rate": 0.00010590586370297987, | |
| "loss": 2.7649, | |
| "step": 45750 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 5.452960014343262, | |
| "learning_rate": 0.0001057228260257167, | |
| "loss": 2.8287, | |
| "step": 45800 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 5.445278167724609, | |
| "learning_rate": 0.0001055397691103514, | |
| "loss": 2.9186, | |
| "step": 45850 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 4.806256294250488, | |
| "learning_rate": 0.00010535669357225606, | |
| "loss": 2.8372, | |
| "step": 45900 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 8.205987930297852, | |
| "learning_rate": 0.00010517360002686542, | |
| "loss": 2.8849, | |
| "step": 45950 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 5.005978107452393, | |
| "learning_rate": 0.00010499048908967479, | |
| "loss": 2.9383, | |
| "step": 46000 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 90183, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 1000, | |
| "total_flos": 1.6541164240896e+16, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |