AFM-Judge-Step-39-V1 / trainer_state.json
Delta-Vector's picture
Upload folder using huggingface_hub
0b9cd8e verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 9.923076923076923,
"eval_steps": 500,
"global_step": 39,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.3076923076923077,
"grad_norm": 429.5760192871094,
"learning_rate": 5e-06,
"loss": 1.9642,
"memory/device_mem_reserved(gib)": 51.52,
"memory/max_mem_active(gib)": 46.79,
"memory/max_mem_allocated(gib)": 45.85,
"step": 1
},
{
"epoch": 0.6153846153846154,
"grad_norm": 88.243408203125,
"learning_rate": 1e-05,
"loss": 1.43,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 2
},
{
"epoch": 0.9230769230769231,
"grad_norm": 188.16270446777344,
"learning_rate": 1.5000000000000002e-05,
"loss": 1.1989,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 3
},
{
"epoch": 1.0,
"grad_norm": 238.33164978027344,
"learning_rate": 2e-05,
"loss": 1.4779,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 4
},
{
"epoch": 1.3076923076923077,
"grad_norm": 60.53327178955078,
"learning_rate": 1.9953596287703015e-05,
"loss": 1.4517,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 5
},
{
"epoch": 1.6153846153846154,
"grad_norm": 89.0710678100586,
"learning_rate": 1.9905213270142184e-05,
"loss": 1.1788,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 6
},
{
"epoch": 1.9230769230769231,
"grad_norm": 74.55708312988281,
"learning_rate": 1.9854721549636805e-05,
"loss": 1.3326,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 7
},
{
"epoch": 2.0,
"grad_norm": 46.6354866027832,
"learning_rate": 1.9801980198019806e-05,
"loss": 1.1162,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 8
},
{
"epoch": 2.3076923076923075,
"grad_norm": 11.567599296569824,
"learning_rate": 1.974683544303798e-05,
"loss": 1.0117,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 9
},
{
"epoch": 2.6153846153846154,
"grad_norm": 15.884381294250488,
"learning_rate": 1.9689119170984456e-05,
"loss": 0.9432,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 10
},
{
"epoch": 2.9230769230769234,
"grad_norm": 8.479954719543457,
"learning_rate": 1.9628647214854114e-05,
"loss": 0.9247,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 11
},
{
"epoch": 3.0,
"grad_norm": 17.43779754638672,
"learning_rate": 1.956521739130435e-05,
"loss": 1.0189,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 12
},
{
"epoch": 3.3076923076923075,
"grad_norm": 8.543362617492676,
"learning_rate": 1.9498607242339832e-05,
"loss": 0.9282,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 13
},
{
"epoch": 3.6153846153846154,
"grad_norm": 5.215980052947998,
"learning_rate": 1.942857142857143e-05,
"loss": 0.8694,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 14
},
{
"epoch": 3.9230769230769234,
"grad_norm": 4.073164463043213,
"learning_rate": 1.935483870967742e-05,
"loss": 0.8029,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 15
},
{
"epoch": 4.0,
"grad_norm": 5.382778167724609,
"learning_rate": 1.9277108433734944e-05,
"loss": 0.7638,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 16
},
{
"epoch": 4.3076923076923075,
"grad_norm": 4.377191066741943,
"learning_rate": 1.9195046439628485e-05,
"loss": 0.7751,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 17
},
{
"epoch": 4.615384615384615,
"grad_norm": 3.5090882778167725,
"learning_rate": 1.9108280254777068e-05,
"loss": 0.731,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 18
},
{
"epoch": 4.923076923076923,
"grad_norm": 4.811877727508545,
"learning_rate": 1.9016393442622952e-05,
"loss": 0.7118,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 19
},
{
"epoch": 5.0,
"grad_norm": 4.822802543640137,
"learning_rate": 1.891891891891892e-05,
"loss": 0.6149,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 20
},
{
"epoch": 5.3076923076923075,
"grad_norm": 4.164008140563965,
"learning_rate": 1.8815331010452963e-05,
"loss": 0.6936,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 21
},
{
"epoch": 5.615384615384615,
"grad_norm": 3.9381167888641357,
"learning_rate": 1.8705035971223024e-05,
"loss": 0.6502,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 22
},
{
"epoch": 5.923076923076923,
"grad_norm": 3.9260995388031006,
"learning_rate": 1.858736059479554e-05,
"loss": 0.633,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 23
},
{
"epoch": 6.0,
"grad_norm": 5.373617649078369,
"learning_rate": 1.846153846153846e-05,
"loss": 0.4986,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 24
},
{
"epoch": 6.3076923076923075,
"grad_norm": 4.031383514404297,
"learning_rate": 1.8326693227091633e-05,
"loss": 0.6118,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 25
},
{
"epoch": 6.615384615384615,
"grad_norm": 4.3576436042785645,
"learning_rate": 1.8181818181818182e-05,
"loss": 0.5786,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 26
},
{
"epoch": 6.923076923076923,
"grad_norm": 3.389698028564453,
"learning_rate": 1.8025751072961374e-05,
"loss": 0.569,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 27
},
{
"epoch": 7.0,
"grad_norm": 3.289379596710205,
"learning_rate": 1.785714285714286e-05,
"loss": 0.3683,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 28
},
{
"epoch": 7.3076923076923075,
"grad_norm": 3.358076333999634,
"learning_rate": 1.7674418604651163e-05,
"loss": 0.5345,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 29
},
{
"epoch": 7.615384615384615,
"grad_norm": 3.419854164123535,
"learning_rate": 1.7475728155339808e-05,
"loss": 0.5051,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 30
},
{
"epoch": 7.923076923076923,
"grad_norm": 3.624562978744507,
"learning_rate": 1.7258883248730966e-05,
"loss": 0.4952,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 31
},
{
"epoch": 8.0,
"grad_norm": 3.478980541229248,
"learning_rate": 1.7021276595744686e-05,
"loss": 0.2747,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 32
},
{
"epoch": 8.307692307692308,
"grad_norm": 3.6138954162597656,
"learning_rate": 1.675977653631285e-05,
"loss": 0.4662,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 33
},
{
"epoch": 8.615384615384615,
"grad_norm": 3.260951042175293,
"learning_rate": 1.647058823529412e-05,
"loss": 0.4351,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 34
},
{
"epoch": 8.923076923076923,
"grad_norm": 4.216782093048096,
"learning_rate": 1.6149068322981367e-05,
"loss": 0.4366,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 35
},
{
"epoch": 9.0,
"grad_norm": 3.045259475708008,
"learning_rate": 1.5789473684210526e-05,
"loss": 0.2067,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 36
},
{
"epoch": 9.307692307692308,
"grad_norm": 3.721900463104248,
"learning_rate": 1.5384615384615384e-05,
"loss": 0.3979,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 37
},
{
"epoch": 9.615384615384615,
"grad_norm": 4.223248481750488,
"learning_rate": 1.4925373134328359e-05,
"loss": 0.3831,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 38
},
{
"epoch": 9.923076923076923,
"grad_norm": 4.570195198059082,
"learning_rate": 1.4400000000000001e-05,
"loss": 0.3919,
"memory/device_mem_reserved(gib)": 51.53,
"memory/max_mem_active(gib)": 46.82,
"memory/max_mem_allocated(gib)": 45.88,
"step": 39
}
],
"logging_steps": 1,
"max_steps": 48,
"num_input_tokens_seen": 0,
"num_train_epochs": 12,
"save_steps": 3,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.482969594438615e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}