{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.988610478359909, "eval_steps": 500, "global_step": 657, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04555808656036447, "grad_norm": 4.052004444605685, "learning_rate": 1.5151515151515152e-06, "loss": 0.5253, "step": 10 }, { "epoch": 0.09111617312072894, "grad_norm": 1.4381960454873794, "learning_rate": 3.0303030303030305e-06, "loss": 0.4498, "step": 20 }, { "epoch": 0.1366742596810934, "grad_norm": 1.3543508392140766, "learning_rate": 4.5454545454545455e-06, "loss": 0.4, "step": 30 }, { "epoch": 0.18223234624145787, "grad_norm": 1.2520872546371757, "learning_rate": 6.060606060606061e-06, "loss": 0.3763, "step": 40 }, { "epoch": 0.22779043280182232, "grad_norm": 1.02523284296708, "learning_rate": 7.5757575757575764e-06, "loss": 0.3626, "step": 50 }, { "epoch": 0.2733485193621868, "grad_norm": 1.093558631219529, "learning_rate": 9.090909090909091e-06, "loss": 0.3496, "step": 60 }, { "epoch": 0.31890660592255127, "grad_norm": 1.1794828114399931, "learning_rate": 9.998869765883566e-06, "loss": 0.3328, "step": 70 }, { "epoch": 0.36446469248291574, "grad_norm": 0.9895352612973752, "learning_rate": 9.986160499534318e-06, "loss": 0.3323, "step": 80 }, { "epoch": 0.41002277904328016, "grad_norm": 1.2593162041954078, "learning_rate": 9.959365197965824e-06, "loss": 0.3476, "step": 90 }, { "epoch": 0.45558086560364464, "grad_norm": 1.0247420858839205, "learning_rate": 9.918559558613344e-06, "loss": 0.3281, "step": 100 }, { "epoch": 0.5011389521640092, "grad_norm": 1.1609463270495288, "learning_rate": 9.863858858486736e-06, "loss": 0.3213, "step": 110 }, { "epoch": 0.5466970387243736, "grad_norm": 1.112608179278684, "learning_rate": 9.795417628509857e-06, "loss": 0.3226, "step": 120 }, { "epoch": 0.592255125284738, "grad_norm": 1.1442245883128161, "learning_rate": 9.713429216966624e-06, "loss": 0.3058, "step": 130 }, { "epoch": 0.6378132118451025, "grad_norm": 1.0029316918162978, "learning_rate": 9.618125243286989e-06, "loss": 0.3114, "step": 140 }, { "epoch": 0.683371298405467, "grad_norm": 1.0408244849779362, "learning_rate": 9.50977494371594e-06, "loss": 0.3091, "step": 150 }, { "epoch": 0.7289293849658315, "grad_norm": 1.0789631799175963, "learning_rate": 9.388684410713977e-06, "loss": 0.3078, "step": 160 }, { "epoch": 0.7744874715261959, "grad_norm": 0.9628230660114916, "learning_rate": 9.255195728237837e-06, "loss": 0.3009, "step": 170 }, { "epoch": 0.8200455580865603, "grad_norm": 0.9722514254536355, "learning_rate": 9.109686005344258e-06, "loss": 0.2947, "step": 180 }, { "epoch": 0.8656036446469249, "grad_norm": 0.8777666899714902, "learning_rate": 8.952566310846931e-06, "loss": 0.2942, "step": 190 }, { "epoch": 0.9111617312072893, "grad_norm": 0.9667172893305681, "learning_rate": 8.784280512036235e-06, "loss": 0.289, "step": 200 }, { "epoch": 0.9567198177676538, "grad_norm": 0.9878477819397482, "learning_rate": 8.60530402074241e-06, "loss": 0.2868, "step": 210 }, { "epoch": 1.0, "grad_norm": 0.9727406585304033, "learning_rate": 8.416142450284565e-06, "loss": 0.2717, "step": 220 }, { "epoch": 1.0455580865603644, "grad_norm": 0.7915636741448634, "learning_rate": 8.217330187099689e-06, "loss": 0.1748, "step": 230 }, { "epoch": 1.0911161731207288, "grad_norm": 0.7837965142867958, "learning_rate": 8.009428881086836e-06, "loss": 0.1684, "step": 240 }, { "epoch": 1.1366742596810935, "grad_norm": 0.8666778827303715, "learning_rate": 7.793025858931317e-06, "loss": 0.1692, "step": 250 }, { "epoch": 1.182232346241458, "grad_norm": 0.849266584139581, "learning_rate": 7.568732464891293e-06, "loss": 0.1742, "step": 260 }, { "epoch": 1.2277904328018223, "grad_norm": 0.7696615783202124, "learning_rate": 7.33718233373407e-06, "loss": 0.1711, "step": 270 }, { "epoch": 1.2733485193621867, "grad_norm": 0.8661434772503379, "learning_rate": 7.099029600701144e-06, "loss": 0.1697, "step": 280 }, { "epoch": 1.3189066059225514, "grad_norm": 0.8221397461631786, "learning_rate": 6.854947053558849e-06, "loss": 0.1667, "step": 290 }, { "epoch": 1.3644646924829158, "grad_norm": 0.7959283850125798, "learning_rate": 6.6056242319551315e-06, "loss": 0.1646, "step": 300 }, { "epoch": 1.4100227790432802, "grad_norm": 0.7934911795062435, "learning_rate": 6.3517654794518156e-06, "loss": 0.1625, "step": 310 }, { "epoch": 1.4555808656036446, "grad_norm": 0.7890669930103191, "learning_rate": 6.094087953735423e-06, "loss": 0.1664, "step": 320 }, { "epoch": 1.501138952164009, "grad_norm": 0.849926754992192, "learning_rate": 5.8333196006277536e-06, "loss": 0.1608, "step": 330 }, { "epoch": 1.5466970387243735, "grad_norm": 0.7920622918925933, "learning_rate": 5.570197097619688e-06, "loss": 0.1589, "step": 340 }, { "epoch": 1.592255125284738, "grad_norm": 0.8974340210426586, "learning_rate": 5.305463772737812e-06, "loss": 0.16, "step": 350 }, { "epoch": 1.6378132118451025, "grad_norm": 0.8910155901512489, "learning_rate": 5.039867504623084e-06, "loss": 0.1661, "step": 360 }, { "epoch": 1.683371298405467, "grad_norm": 0.8438540298957333, "learning_rate": 4.774158609753908e-06, "loss": 0.1498, "step": 370 }, { "epoch": 1.7289293849658316, "grad_norm": 0.8003749509299389, "learning_rate": 4.5090877227822424e-06, "loss": 0.1558, "step": 380 }, { "epoch": 1.774487471526196, "grad_norm": 0.7879659477066289, "learning_rate": 4.245403675970877e-06, "loss": 0.1583, "step": 390 }, { "epoch": 1.8200455580865604, "grad_norm": 0.7653210529786019, "learning_rate": 3.9838513837224814e-06, "loss": 0.1495, "step": 400 }, { "epoch": 1.8656036446469249, "grad_norm": 0.8205629178189233, "learning_rate": 3.7251697381767373e-06, "loss": 0.152, "step": 410 }, { "epoch": 1.9111617312072893, "grad_norm": 0.8015055799282049, "learning_rate": 3.4700895218205026e-06, "loss": 0.1454, "step": 420 }, { "epoch": 1.9567198177676537, "grad_norm": 0.7703178443216857, "learning_rate": 3.2193313430079737e-06, "loss": 0.1456, "step": 430 }, { "epoch": 2.0, "grad_norm": 0.7965347444065239, "learning_rate": 2.9736036002230332e-06, "loss": 0.1401, "step": 440 }, { "epoch": 2.0455580865603644, "grad_norm": 0.7224665974614691, "learning_rate": 2.7336004808348094e-06, "loss": 0.0746, "step": 450 }, { "epoch": 2.091116173120729, "grad_norm": 0.6344834885792635, "learning_rate": 2.5000000000000015e-06, "loss": 0.0704, "step": 460 }, { "epoch": 2.1366742596810933, "grad_norm": 0.6333151400219024, "learning_rate": 2.273462085252146e-06, "loss": 0.0719, "step": 470 }, { "epoch": 2.1822323462414577, "grad_norm": 0.6345496356388078, "learning_rate": 2.0546267121888863e-06, "loss": 0.0683, "step": 480 }, { "epoch": 2.2277904328018225, "grad_norm": 0.663831749653779, "learning_rate": 1.8441120965239912e-06, "loss": 0.0707, "step": 490 }, { "epoch": 2.273348519362187, "grad_norm": 0.6978500583302198, "learning_rate": 1.642512947611622e-06, "loss": 0.0719, "step": 500 }, { "epoch": 2.3189066059225514, "grad_norm": 0.5899803755413382, "learning_rate": 1.4503987883766857e-06, "loss": 0.0655, "step": 510 }, { "epoch": 2.364464692482916, "grad_norm": 0.597275435306883, "learning_rate": 1.2683123463975144e-06, "loss": 0.0658, "step": 520 }, { "epoch": 2.41002277904328, "grad_norm": 0.6130036081797414, "learning_rate": 1.0967680206861198e-06, "loss": 0.0691, "step": 530 }, { "epoch": 2.4555808656036446, "grad_norm": 0.6291943478576331, "learning_rate": 9.362504284973683e-07, "loss": 0.0701, "step": 540 }, { "epoch": 2.501138952164009, "grad_norm": 0.6719915377398242, "learning_rate": 7.872130362724422e-07, "loss": 0.0689, "step": 550 }, { "epoch": 2.5466970387243735, "grad_norm": 0.6866035161561178, "learning_rate": 6.500768785841482e-07, "loss": 0.065, "step": 560 }, { "epoch": 2.592255125284738, "grad_norm": 0.6382818453339463, "learning_rate": 5.252293687031196e-07, "loss": 0.0661, "step": 570 }, { "epoch": 2.6378132118451028, "grad_norm": 0.6023391864806479, "learning_rate": 4.130232041450866e-07, "loss": 0.0613, "step": 580 }, { "epoch": 2.6833712984054667, "grad_norm": 0.6283362244939039, "learning_rate": 3.1377537029107174e-07, "loss": 0.065, "step": 590 }, { "epoch": 2.7289293849658316, "grad_norm": 0.6324038348983413, "learning_rate": 2.2776624489530664e-07, "loss": 0.0662, "step": 600 }, { "epoch": 2.774487471526196, "grad_norm": 0.6463288010084847, "learning_rate": 1.55238806010668e-07, "loss": 0.0637, "step": 610 }, { "epoch": 2.8200455580865604, "grad_norm": 0.5961299692551064, "learning_rate": 9.639794556925041e-08, "loss": 0.0645, "step": 620 }, { "epoch": 2.865603644646925, "grad_norm": 0.6588358857173194, "learning_rate": 5.1409890557246876e-08, "loss": 0.0644, "step": 630 }, { "epoch": 2.9111617312072893, "grad_norm": 0.5976452121379741, "learning_rate": 2.0401733419315727e-08, "loss": 0.0626, "step": 640 }, { "epoch": 2.9567198177676537, "grad_norm": 0.6600908299010216, "learning_rate": 3.4610730190648423e-09, "loss": 0.0641, "step": 650 } ], "logging_steps": 10, "max_steps": 657, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 418097876697088.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }