VocalNet-Qwen3-8B / trainer_state.json
SandO114's picture
Upload folder using huggingface_hub
15b5399 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1431,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0006988120195667365,
"grad_norm": 20.01421639294812,
"learning_rate": 0.0,
"loss": 26.1109,
"num_tokens": 7466.0,
"step": 1
},
{
"epoch": 0.001397624039133473,
"grad_norm": 15.36930185770397,
"learning_rate": 4.651162790697674e-06,
"loss": 26.1453,
"num_tokens": 14867.0,
"step": 2
},
{
"epoch": 0.0020964360587002098,
"grad_norm": 17.05352336534915,
"learning_rate": 9.302325581395349e-06,
"loss": 26.0888,
"num_tokens": 22758.0,
"step": 3
},
{
"epoch": 0.002795248078266946,
"grad_norm": 19.04711212026682,
"learning_rate": 1.3953488372093024e-05,
"loss": 25.6631,
"num_tokens": 30251.0,
"step": 4
},
{
"epoch": 0.0034940600978336828,
"grad_norm": 28.812900832385424,
"learning_rate": 1.8604651162790697e-05,
"loss": 25.3928,
"num_tokens": 37442.0,
"step": 5
},
{
"epoch": 0.0041928721174004195,
"grad_norm": 63.640698917817346,
"learning_rate": 2.3255813953488374e-05,
"loss": 26.0258,
"num_tokens": 44450.0,
"step": 6
},
{
"epoch": 0.004891684136967156,
"grad_norm": 60.46179037964818,
"learning_rate": 2.7906976744186048e-05,
"loss": 25.6428,
"num_tokens": 51596.0,
"step": 7
},
{
"epoch": 0.005590496156533892,
"grad_norm": 31.396116729467284,
"learning_rate": 3.2558139534883724e-05,
"loss": 24.7731,
"num_tokens": 58501.0,
"step": 8
},
{
"epoch": 0.006289308176100629,
"grad_norm": 33.63353151516081,
"learning_rate": 3.7209302325581394e-05,
"loss": 23.8918,
"num_tokens": 65085.0,
"step": 9
},
{
"epoch": 0.0069881201956673656,
"grad_norm": 22.00249182717323,
"learning_rate": 4.186046511627907e-05,
"loss": 23.6714,
"num_tokens": 72520.0,
"step": 10
},
{
"epoch": 0.007686932215234102,
"grad_norm": 21.52192510277941,
"learning_rate": 4.651162790697675e-05,
"loss": 23.4993,
"num_tokens": 79490.0,
"step": 11
},
{
"epoch": 0.008385744234800839,
"grad_norm": 18.356839468525436,
"learning_rate": 5.1162790697674425e-05,
"loss": 23.458,
"num_tokens": 86902.0,
"step": 12
},
{
"epoch": 0.009084556254367574,
"grad_norm": 11.54924819578326,
"learning_rate": 5.5813953488372095e-05,
"loss": 22.736,
"num_tokens": 92738.0,
"step": 13
},
{
"epoch": 0.009783368273934312,
"grad_norm": 36.52460306155796,
"learning_rate": 6.0465116279069765e-05,
"loss": 23.2197,
"num_tokens": 100060.0,
"step": 14
},
{
"epoch": 0.010482180293501049,
"grad_norm": 12.549039640154193,
"learning_rate": 6.511627906976745e-05,
"loss": 21.9672,
"num_tokens": 107649.0,
"step": 15
},
{
"epoch": 0.011180992313067784,
"grad_norm": 6.65450263785363,
"learning_rate": 6.976744186046513e-05,
"loss": 21.4422,
"num_tokens": 114327.0,
"step": 16
},
{
"epoch": 0.011879804332634521,
"grad_norm": 26.60725050433419,
"learning_rate": 7.441860465116279e-05,
"loss": 21.7565,
"num_tokens": 121566.0,
"step": 17
},
{
"epoch": 0.012578616352201259,
"grad_norm": 14.909109212430055,
"learning_rate": 7.906976744186047e-05,
"loss": 21.4321,
"num_tokens": 128982.0,
"step": 18
},
{
"epoch": 0.013277428371767994,
"grad_norm": 8.850594729109151,
"learning_rate": 8.372093023255814e-05,
"loss": 21.2315,
"num_tokens": 136536.0,
"step": 19
},
{
"epoch": 0.013976240391334731,
"grad_norm": 4.64153452983114,
"learning_rate": 8.837209302325582e-05,
"loss": 20.9173,
"num_tokens": 144292.0,
"step": 20
},
{
"epoch": 0.014675052410901468,
"grad_norm": 7.1557479804922774,
"learning_rate": 9.30232558139535e-05,
"loss": 20.6413,
"num_tokens": 151133.0,
"step": 21
},
{
"epoch": 0.015373864430468204,
"grad_norm": 12.58835350216716,
"learning_rate": 9.767441860465116e-05,
"loss": 20.6017,
"num_tokens": 158252.0,
"step": 22
},
{
"epoch": 0.01607267645003494,
"grad_norm": 4.738505709948937,
"learning_rate": 0.00010232558139534885,
"loss": 20.2481,
"num_tokens": 165457.0,
"step": 23
},
{
"epoch": 0.016771488469601678,
"grad_norm": 3.7959176276381172,
"learning_rate": 0.00010697674418604651,
"loss": 20.0811,
"num_tokens": 172785.0,
"step": 24
},
{
"epoch": 0.017470300489168415,
"grad_norm": 11.694063836440657,
"learning_rate": 0.00011162790697674419,
"loss": 20.1969,
"num_tokens": 179494.0,
"step": 25
},
{
"epoch": 0.01816911250873515,
"grad_norm": 9.55869849357741,
"learning_rate": 0.00011627906976744187,
"loss": 20.2425,
"num_tokens": 187501.0,
"step": 26
},
{
"epoch": 0.018867924528301886,
"grad_norm": 4.6654736532723255,
"learning_rate": 0.00012093023255813953,
"loss": 20.1472,
"num_tokens": 194486.0,
"step": 27
},
{
"epoch": 0.019566736547868623,
"grad_norm": 3.543295429416166,
"learning_rate": 0.0001255813953488372,
"loss": 19.6634,
"num_tokens": 201489.0,
"step": 28
},
{
"epoch": 0.02026554856743536,
"grad_norm": 4.256858179676704,
"learning_rate": 0.0001302325581395349,
"loss": 19.6026,
"num_tokens": 209354.0,
"step": 29
},
{
"epoch": 0.020964360587002098,
"grad_norm": 6.097755497420057,
"learning_rate": 0.00013488372093023256,
"loss": 19.3001,
"num_tokens": 216345.0,
"step": 30
},
{
"epoch": 0.02166317260656883,
"grad_norm": 3.5691978934846538,
"learning_rate": 0.00013953488372093025,
"loss": 19.4532,
"num_tokens": 223037.0,
"step": 31
},
{
"epoch": 0.02236198462613557,
"grad_norm": 3.9279818551475314,
"learning_rate": 0.00014418604651162791,
"loss": 19.4291,
"num_tokens": 230416.0,
"step": 32
},
{
"epoch": 0.023060796645702306,
"grad_norm": 3.2960834272756525,
"learning_rate": 0.00014883720930232558,
"loss": 18.919,
"num_tokens": 237115.0,
"step": 33
},
{
"epoch": 0.023759608665269043,
"grad_norm": 3.334129318532873,
"learning_rate": 0.00015348837209302327,
"loss": 18.7862,
"num_tokens": 244240.0,
"step": 34
},
{
"epoch": 0.02445842068483578,
"grad_norm": 4.663518875645515,
"learning_rate": 0.00015813953488372093,
"loss": 18.9105,
"num_tokens": 251723.0,
"step": 35
},
{
"epoch": 0.025157232704402517,
"grad_norm": 3.0967131320916286,
"learning_rate": 0.00016279069767441862,
"loss": 18.3404,
"num_tokens": 258362.0,
"step": 36
},
{
"epoch": 0.02585604472396925,
"grad_norm": 3.47077766113365,
"learning_rate": 0.00016744186046511629,
"loss": 18.5825,
"num_tokens": 265593.0,
"step": 37
},
{
"epoch": 0.026554856743535988,
"grad_norm": 3.2117310830547514,
"learning_rate": 0.00017209302325581395,
"loss": 17.9212,
"num_tokens": 272624.0,
"step": 38
},
{
"epoch": 0.027253668763102725,
"grad_norm": 5.4619700154340665,
"learning_rate": 0.00017674418604651164,
"loss": 18.594,
"num_tokens": 279181.0,
"step": 39
},
{
"epoch": 0.027952480782669462,
"grad_norm": 3.404742927998481,
"learning_rate": 0.0001813953488372093,
"loss": 18.1244,
"num_tokens": 285979.0,
"step": 40
},
{
"epoch": 0.0286512928022362,
"grad_norm": 2.6009044708722584,
"learning_rate": 0.000186046511627907,
"loss": 18.0623,
"num_tokens": 293070.0,
"step": 41
},
{
"epoch": 0.029350104821802937,
"grad_norm": 3.109662691913911,
"learning_rate": 0.00019069767441860466,
"loss": 17.4661,
"num_tokens": 299558.0,
"step": 42
},
{
"epoch": 0.03004891684136967,
"grad_norm": 3.648517834649003,
"learning_rate": 0.00019534883720930232,
"loss": 17.6153,
"num_tokens": 306733.0,
"step": 43
},
{
"epoch": 0.030747728860936407,
"grad_norm": 4.72227718473326,
"learning_rate": 0.0002,
"loss": 17.4022,
"num_tokens": 313625.0,
"step": 44
},
{
"epoch": 0.031446540880503145,
"grad_norm": 3.624756224705375,
"learning_rate": 0.00019999974385219888,
"loss": 17.382,
"num_tokens": 320835.0,
"step": 45
},
{
"epoch": 0.03214535290006988,
"grad_norm": 3.4671960501465815,
"learning_rate": 0.00019999897541010772,
"loss": 16.9532,
"num_tokens": 328348.0,
"step": 46
},
{
"epoch": 0.03284416491963662,
"grad_norm": 3.103292868331405,
"learning_rate": 0.00019999769467766323,
"loss": 16.7449,
"num_tokens": 335913.0,
"step": 47
},
{
"epoch": 0.033542976939203356,
"grad_norm": 4.264418554780992,
"learning_rate": 0.00019999590166142655,
"loss": 16.7949,
"num_tokens": 343205.0,
"step": 48
},
{
"epoch": 0.03424178895877009,
"grad_norm": 2.9976124716737447,
"learning_rate": 0.0001999935963705832,
"loss": 16.4737,
"num_tokens": 350261.0,
"step": 49
},
{
"epoch": 0.03494060097833683,
"grad_norm": 5.4690748444855455,
"learning_rate": 0.0001999907788169431,
"loss": 16.8885,
"num_tokens": 357367.0,
"step": 50
},
{
"epoch": 0.03563941299790356,
"grad_norm": 3.093190984373506,
"learning_rate": 0.00019998744901494049,
"loss": 16.2166,
"num_tokens": 364705.0,
"step": 51
},
{
"epoch": 0.0363382250174703,
"grad_norm": 3.231011659915006,
"learning_rate": 0.00019998360698163375,
"loss": 16.3892,
"num_tokens": 371422.0,
"step": 52
},
{
"epoch": 0.037037037037037035,
"grad_norm": 3.267414032427895,
"learning_rate": 0.00019997925273670543,
"loss": 16.1599,
"num_tokens": 378970.0,
"step": 53
},
{
"epoch": 0.03773584905660377,
"grad_norm": 3.249781757856964,
"learning_rate": 0.0001999743863024622,
"loss": 16.0245,
"num_tokens": 386189.0,
"step": 54
},
{
"epoch": 0.03843466107617051,
"grad_norm": 2.7793521009997457,
"learning_rate": 0.00019996900770383454,
"loss": 16.2024,
"num_tokens": 393815.0,
"step": 55
},
{
"epoch": 0.039133473095737246,
"grad_norm": 2.3740220719738527,
"learning_rate": 0.0001999631169683768,
"loss": 15.9862,
"num_tokens": 399923.0,
"step": 56
},
{
"epoch": 0.039832285115303984,
"grad_norm": 2.504515937578137,
"learning_rate": 0.0001999567141262669,
"loss": 15.8262,
"num_tokens": 406458.0,
"step": 57
},
{
"epoch": 0.04053109713487072,
"grad_norm": 2.585577847048995,
"learning_rate": 0.0001999497992103064,
"loss": 15.8221,
"num_tokens": 414710.0,
"step": 58
},
{
"epoch": 0.04122990915443746,
"grad_norm": 4.077863543627188,
"learning_rate": 0.00019994237225592012,
"loss": 15.8001,
"num_tokens": 422447.0,
"step": 59
},
{
"epoch": 0.041928721174004195,
"grad_norm": 2.3612560688538227,
"learning_rate": 0.00019993443330115592,
"loss": 15.4257,
"num_tokens": 428907.0,
"step": 60
},
{
"epoch": 0.04262753319357093,
"grad_norm": 3.1342939019471863,
"learning_rate": 0.0001999259823866848,
"loss": 15.5804,
"num_tokens": 436247.0,
"step": 61
},
{
"epoch": 0.04332634521313766,
"grad_norm": 2.667474376403426,
"learning_rate": 0.0001999170195558004,
"loss": 15.2497,
"num_tokens": 443409.0,
"step": 62
},
{
"epoch": 0.0440251572327044,
"grad_norm": 2.94929141586954,
"learning_rate": 0.0001999075448544189,
"loss": 15.2836,
"num_tokens": 450956.0,
"step": 63
},
{
"epoch": 0.04472396925227114,
"grad_norm": 4.115184597341321,
"learning_rate": 0.00019989755833107876,
"loss": 15.0676,
"num_tokens": 458417.0,
"step": 64
},
{
"epoch": 0.045422781271837874,
"grad_norm": 2.2936726283661093,
"learning_rate": 0.00019988706003694055,
"loss": 15.1663,
"num_tokens": 465196.0,
"step": 65
},
{
"epoch": 0.04612159329140461,
"grad_norm": 3.263296116111796,
"learning_rate": 0.00019987605002578653,
"loss": 14.9832,
"num_tokens": 472489.0,
"step": 66
},
{
"epoch": 0.04682040531097135,
"grad_norm": 2.5788902845272443,
"learning_rate": 0.0001998645283540205,
"loss": 14.8966,
"num_tokens": 479315.0,
"step": 67
},
{
"epoch": 0.047519217330538085,
"grad_norm": 2.631994580040522,
"learning_rate": 0.00019985249508066755,
"loss": 15.1958,
"num_tokens": 486906.0,
"step": 68
},
{
"epoch": 0.04821802935010482,
"grad_norm": 2.5782733328645304,
"learning_rate": 0.0001998399502673735,
"loss": 14.796,
"num_tokens": 493938.0,
"step": 69
},
{
"epoch": 0.04891684136967156,
"grad_norm": 3.0213769951549816,
"learning_rate": 0.00019982689397840496,
"loss": 14.9338,
"num_tokens": 500590.0,
"step": 70
},
{
"epoch": 0.0496156533892383,
"grad_norm": 3.5678659064272087,
"learning_rate": 0.00019981332628064865,
"loss": 14.9063,
"num_tokens": 508345.0,
"step": 71
},
{
"epoch": 0.050314465408805034,
"grad_norm": 2.313425627629096,
"learning_rate": 0.0001997992472436114,
"loss": 14.7596,
"num_tokens": 515272.0,
"step": 72
},
{
"epoch": 0.05101327742837177,
"grad_norm": 2.4067823507135753,
"learning_rate": 0.0001997846569394194,
"loss": 14.7652,
"num_tokens": 522104.0,
"step": 73
},
{
"epoch": 0.0517120894479385,
"grad_norm": 2.2152061116996364,
"learning_rate": 0.00019976955544281815,
"loss": 14.6565,
"num_tokens": 529458.0,
"step": 74
},
{
"epoch": 0.05241090146750524,
"grad_norm": 3.2940129288988547,
"learning_rate": 0.000199753942831172,
"loss": 14.3745,
"num_tokens": 537278.0,
"step": 75
},
{
"epoch": 0.053109713487071976,
"grad_norm": 3.2872893577085933,
"learning_rate": 0.0001997378191844636,
"loss": 14.654,
"num_tokens": 544549.0,
"step": 76
},
{
"epoch": 0.05380852550663871,
"grad_norm": 3.4280215978452544,
"learning_rate": 0.00019972118458529375,
"loss": 14.2309,
"num_tokens": 551068.0,
"step": 77
},
{
"epoch": 0.05450733752620545,
"grad_norm": 2.064355698482165,
"learning_rate": 0.00019970403911888078,
"loss": 14.5744,
"num_tokens": 558188.0,
"step": 78
},
{
"epoch": 0.05520614954577219,
"grad_norm": 2.716813836457923,
"learning_rate": 0.0001996863828730601,
"loss": 14.4787,
"num_tokens": 565805.0,
"step": 79
},
{
"epoch": 0.055904961565338925,
"grad_norm": 2.4118756690009997,
"learning_rate": 0.00019966821593828392,
"loss": 14.2957,
"num_tokens": 573381.0,
"step": 80
},
{
"epoch": 0.05660377358490566,
"grad_norm": 2.4955272028998454,
"learning_rate": 0.0001996495384076206,
"loss": 14.1731,
"num_tokens": 581350.0,
"step": 81
},
{
"epoch": 0.0573025856044724,
"grad_norm": 2.3646224597467445,
"learning_rate": 0.0001996303503767544,
"loss": 14.1787,
"num_tokens": 588473.0,
"step": 82
},
{
"epoch": 0.058001397624039136,
"grad_norm": 2.9154207825942637,
"learning_rate": 0.00019961065194398466,
"loss": 14.0221,
"num_tokens": 595623.0,
"step": 83
},
{
"epoch": 0.05870020964360587,
"grad_norm": 2.144505504814399,
"learning_rate": 0.00019959044321022563,
"loss": 14.3507,
"num_tokens": 602999.0,
"step": 84
},
{
"epoch": 0.0593990216631726,
"grad_norm": 2.32919869196643,
"learning_rate": 0.00019956972427900578,
"loss": 13.9408,
"num_tokens": 610543.0,
"step": 85
},
{
"epoch": 0.06009783368273934,
"grad_norm": 2.744731912474738,
"learning_rate": 0.00019954849525646726,
"loss": 13.917,
"num_tokens": 617203.0,
"step": 86
},
{
"epoch": 0.06079664570230608,
"grad_norm": 3.0081451482892416,
"learning_rate": 0.0001995267562513654,
"loss": 14.4755,
"num_tokens": 623602.0,
"step": 87
},
{
"epoch": 0.061495457721872815,
"grad_norm": 2.0337097238095176,
"learning_rate": 0.00019950450737506824,
"loss": 14.0758,
"num_tokens": 631160.0,
"step": 88
},
{
"epoch": 0.06219426974143955,
"grad_norm": 1.872341236813854,
"learning_rate": 0.00019948174874155573,
"loss": 13.6134,
"num_tokens": 638355.0,
"step": 89
},
{
"epoch": 0.06289308176100629,
"grad_norm": 1.8676837839342457,
"learning_rate": 0.00019945848046741934,
"loss": 13.8425,
"num_tokens": 645362.0,
"step": 90
},
{
"epoch": 0.06359189378057302,
"grad_norm": 1.7615381904391172,
"learning_rate": 0.00019943470267186144,
"loss": 13.7478,
"num_tokens": 652539.0,
"step": 91
},
{
"epoch": 0.06429070580013976,
"grad_norm": 2.3029110752387307,
"learning_rate": 0.00019941041547669465,
"loss": 13.9631,
"num_tokens": 659738.0,
"step": 92
},
{
"epoch": 0.0649895178197065,
"grad_norm": 2.0875724707236114,
"learning_rate": 0.0001993856190063412,
"loss": 13.7651,
"num_tokens": 667862.0,
"step": 93
},
{
"epoch": 0.06568832983927324,
"grad_norm": 2.8116799014033287,
"learning_rate": 0.00019936031338783225,
"loss": 13.9233,
"num_tokens": 675183.0,
"step": 94
},
{
"epoch": 0.06638714185883997,
"grad_norm": 2.1242762146916303,
"learning_rate": 0.00019933449875080746,
"loss": 13.5429,
"num_tokens": 682199.0,
"step": 95
},
{
"epoch": 0.06708595387840671,
"grad_norm": 3.0916928086184865,
"learning_rate": 0.00019930817522751401,
"loss": 13.6249,
"num_tokens": 689412.0,
"step": 96
},
{
"epoch": 0.06778476589797344,
"grad_norm": 2.1525163646868517,
"learning_rate": 0.0001992813429528062,
"loss": 13.592,
"num_tokens": 696608.0,
"step": 97
},
{
"epoch": 0.06848357791754019,
"grad_norm": 2.5580695598792955,
"learning_rate": 0.0001992540020641446,
"loss": 13.4303,
"num_tokens": 703838.0,
"step": 98
},
{
"epoch": 0.06918238993710692,
"grad_norm": 2.2127195967055884,
"learning_rate": 0.0001992261527015953,
"loss": 13.6424,
"num_tokens": 711432.0,
"step": 99
},
{
"epoch": 0.06988120195667366,
"grad_norm": 2.092591387075435,
"learning_rate": 0.00019919779500782948,
"loss": 13.6159,
"num_tokens": 717755.0,
"step": 100
},
{
"epoch": 0.07058001397624039,
"grad_norm": 2.649063124260544,
"learning_rate": 0.0001991689291281223,
"loss": 13.7373,
"num_tokens": 725381.0,
"step": 101
},
{
"epoch": 0.07127882599580712,
"grad_norm": 2.058843564636829,
"learning_rate": 0.00019913955521035234,
"loss": 13.2791,
"num_tokens": 732317.0,
"step": 102
},
{
"epoch": 0.07197763801537387,
"grad_norm": 2.003927168402966,
"learning_rate": 0.00019910967340500094,
"loss": 13.5031,
"num_tokens": 739043.0,
"step": 103
},
{
"epoch": 0.0726764500349406,
"grad_norm": 1.7028539422821216,
"learning_rate": 0.00019907928386515126,
"loss": 13.4382,
"num_tokens": 745729.0,
"step": 104
},
{
"epoch": 0.07337526205450734,
"grad_norm": 2.2764500816492568,
"learning_rate": 0.00019904838674648763,
"loss": 13.3326,
"num_tokens": 753195.0,
"step": 105
},
{
"epoch": 0.07407407407407407,
"grad_norm": 2.09678454014234,
"learning_rate": 0.00019901698220729458,
"loss": 13.5748,
"num_tokens": 759938.0,
"step": 106
},
{
"epoch": 0.07477288609364081,
"grad_norm": 2.2370783473533984,
"learning_rate": 0.00019898507040845616,
"loss": 13.2876,
"num_tokens": 767625.0,
"step": 107
},
{
"epoch": 0.07547169811320754,
"grad_norm": 1.922904626284315,
"learning_rate": 0.00019895265151345518,
"loss": 13.1701,
"num_tokens": 775131.0,
"step": 108
},
{
"epoch": 0.07617051013277429,
"grad_norm": 1.6799896369866845,
"learning_rate": 0.00019891972568837214,
"loss": 13.2668,
"num_tokens": 782395.0,
"step": 109
},
{
"epoch": 0.07686932215234102,
"grad_norm": 2.3877041372388272,
"learning_rate": 0.00019888629310188465,
"loss": 13.3328,
"num_tokens": 789064.0,
"step": 110
},
{
"epoch": 0.07756813417190776,
"grad_norm": 1.9592142247636106,
"learning_rate": 0.00019885235392526636,
"loss": 13.2284,
"num_tokens": 796698.0,
"step": 111
},
{
"epoch": 0.07826694619147449,
"grad_norm": 1.9080524611085286,
"learning_rate": 0.00019881790833238617,
"loss": 13.2042,
"num_tokens": 803919.0,
"step": 112
},
{
"epoch": 0.07896575821104122,
"grad_norm": 2.24974818928377,
"learning_rate": 0.00019878295649970734,
"loss": 13.1838,
"num_tokens": 810971.0,
"step": 113
},
{
"epoch": 0.07966457023060797,
"grad_norm": 2.1506162194631515,
"learning_rate": 0.0001987474986062866,
"loss": 13.0353,
"num_tokens": 817887.0,
"step": 114
},
{
"epoch": 0.0803633822501747,
"grad_norm": 1.7308970262493222,
"learning_rate": 0.00019871153483377315,
"loss": 13.0944,
"num_tokens": 824738.0,
"step": 115
},
{
"epoch": 0.08106219426974144,
"grad_norm": 1.9315001985846878,
"learning_rate": 0.0001986750653664078,
"loss": 13.1709,
"num_tokens": 832079.0,
"step": 116
},
{
"epoch": 0.08176100628930817,
"grad_norm": 2.260956956052656,
"learning_rate": 0.0001986380903910221,
"loss": 13.1032,
"num_tokens": 838908.0,
"step": 117
},
{
"epoch": 0.08245981830887492,
"grad_norm": 1.5924533402089038,
"learning_rate": 0.00019860061009703713,
"loss": 13.1797,
"num_tokens": 845348.0,
"step": 118
},
{
"epoch": 0.08315863032844165,
"grad_norm": 2.1606192319469115,
"learning_rate": 0.00019856262467646282,
"loss": 13.2937,
"num_tokens": 852162.0,
"step": 119
},
{
"epoch": 0.08385744234800839,
"grad_norm": 1.5684395978061723,
"learning_rate": 0.00019852413432389684,
"loss": 13.061,
"num_tokens": 860170.0,
"step": 120
},
{
"epoch": 0.08455625436757512,
"grad_norm": 2.0070206310239476,
"learning_rate": 0.00019848513923652358,
"loss": 13.0771,
"num_tokens": 867476.0,
"step": 121
},
{
"epoch": 0.08525506638714186,
"grad_norm": 1.5728708315681226,
"learning_rate": 0.00019844563961411309,
"loss": 12.9597,
"num_tokens": 874866.0,
"step": 122
},
{
"epoch": 0.0859538784067086,
"grad_norm": 2.162003809520903,
"learning_rate": 0.00019840563565902026,
"loss": 13.1082,
"num_tokens": 881774.0,
"step": 123
},
{
"epoch": 0.08665269042627533,
"grad_norm": 1.829836068226425,
"learning_rate": 0.00019836512757618355,
"loss": 13.0226,
"num_tokens": 888149.0,
"step": 124
},
{
"epoch": 0.08735150244584207,
"grad_norm": 2.2023507815241103,
"learning_rate": 0.00019832411557312414,
"loss": 13.311,
"num_tokens": 894693.0,
"step": 125
},
{
"epoch": 0.0880503144654088,
"grad_norm": 2.0453068368799863,
"learning_rate": 0.00019828259985994463,
"loss": 13.273,
"num_tokens": 901024.0,
"step": 126
},
{
"epoch": 0.08874912648497554,
"grad_norm": 1.4939550415133291,
"learning_rate": 0.00019824058064932831,
"loss": 13.0516,
"num_tokens": 908206.0,
"step": 127
},
{
"epoch": 0.08944793850454227,
"grad_norm": 2.2979259766990654,
"learning_rate": 0.00019819805815653768,
"loss": 13.1368,
"num_tokens": 914376.0,
"step": 128
},
{
"epoch": 0.09014675052410902,
"grad_norm": 1.8419979568421898,
"learning_rate": 0.00019815503259941358,
"loss": 13.0085,
"num_tokens": 921721.0,
"step": 129
},
{
"epoch": 0.09084556254367575,
"grad_norm": 1.8302796577604992,
"learning_rate": 0.0001981115041983741,
"loss": 12.9987,
"num_tokens": 928937.0,
"step": 130
},
{
"epoch": 0.09154437456324249,
"grad_norm": 1.6088442418792948,
"learning_rate": 0.0001980674731764133,
"loss": 12.9823,
"num_tokens": 936881.0,
"step": 131
},
{
"epoch": 0.09224318658280922,
"grad_norm": 1.9971274822057858,
"learning_rate": 0.00019802293975910016,
"loss": 13.0976,
"num_tokens": 943685.0,
"step": 132
},
{
"epoch": 0.09294199860237597,
"grad_norm": 1.5298730715886686,
"learning_rate": 0.00019797790417457742,
"loss": 12.8484,
"num_tokens": 950976.0,
"step": 133
},
{
"epoch": 0.0936408106219427,
"grad_norm": 2.0586082579521334,
"learning_rate": 0.0001979323666535604,
"loss": 13.0251,
"num_tokens": 957928.0,
"step": 134
},
{
"epoch": 0.09433962264150944,
"grad_norm": 1.6761450760295233,
"learning_rate": 0.00019788632742933585,
"loss": 12.877,
"num_tokens": 964412.0,
"step": 135
},
{
"epoch": 0.09503843466107617,
"grad_norm": 1.6881768033105196,
"learning_rate": 0.00019783978673776063,
"loss": 12.8942,
"num_tokens": 971468.0,
"step": 136
},
{
"epoch": 0.0957372466806429,
"grad_norm": 1.7144893089190771,
"learning_rate": 0.00019779274481726073,
"loss": 12.9506,
"num_tokens": 978459.0,
"step": 137
},
{
"epoch": 0.09643605870020965,
"grad_norm": 1.4983030666694932,
"learning_rate": 0.00019774520190882978,
"loss": 12.9223,
"num_tokens": 985920.0,
"step": 138
},
{
"epoch": 0.09713487071977638,
"grad_norm": 1.6874127706486612,
"learning_rate": 0.00019769715825602803,
"loss": 12.8427,
"num_tokens": 992764.0,
"step": 139
},
{
"epoch": 0.09783368273934312,
"grad_norm": 1.6763957290933016,
"learning_rate": 0.00019764861410498098,
"loss": 12.9646,
"num_tokens": 999854.0,
"step": 140
},
{
"epoch": 0.09853249475890985,
"grad_norm": 1.9957992886089357,
"learning_rate": 0.00019759956970437825,
"loss": 12.8047,
"num_tokens": 1006667.0,
"step": 141
},
{
"epoch": 0.0992313067784766,
"grad_norm": 1.5867823770505232,
"learning_rate": 0.00019755002530547208,
"loss": 12.9937,
"num_tokens": 1014089.0,
"step": 142
},
{
"epoch": 0.09993011879804332,
"grad_norm": 1.6071217458716918,
"learning_rate": 0.00019749998116207621,
"loss": 13.0317,
"num_tokens": 1020912.0,
"step": 143
},
{
"epoch": 0.10062893081761007,
"grad_norm": 1.733309225841312,
"learning_rate": 0.00019744943753056472,
"loss": 12.6994,
"num_tokens": 1028495.0,
"step": 144
},
{
"epoch": 0.1013277428371768,
"grad_norm": 1.5096215182516033,
"learning_rate": 0.0001973983946698703,
"loss": 12.7572,
"num_tokens": 1036332.0,
"step": 145
},
{
"epoch": 0.10202655485674354,
"grad_norm": 1.683071966779659,
"learning_rate": 0.0001973468528414833,
"loss": 12.8509,
"num_tokens": 1043895.0,
"step": 146
},
{
"epoch": 0.10272536687631027,
"grad_norm": 1.7014370401459447,
"learning_rate": 0.0001972948123094503,
"loss": 12.8687,
"num_tokens": 1051123.0,
"step": 147
},
{
"epoch": 0.103424178895877,
"grad_norm": 1.3443197503129412,
"learning_rate": 0.00019724227334037256,
"loss": 12.8757,
"num_tokens": 1058636.0,
"step": 148
},
{
"epoch": 0.10412299091544375,
"grad_norm": 1.690543644382228,
"learning_rate": 0.00019718923620340496,
"loss": 12.8779,
"num_tokens": 1065532.0,
"step": 149
},
{
"epoch": 0.10482180293501048,
"grad_norm": 1.3035032802273938,
"learning_rate": 0.00019713570117025443,
"loss": 12.7007,
"num_tokens": 1073092.0,
"step": 150
},
{
"epoch": 0.10552061495457722,
"grad_norm": 1.5307507354350658,
"learning_rate": 0.0001970816685151786,
"loss": 12.6439,
"num_tokens": 1081083.0,
"step": 151
},
{
"epoch": 0.10621942697414395,
"grad_norm": 1.4929379554646116,
"learning_rate": 0.00019702713851498435,
"loss": 12.7074,
"num_tokens": 1088405.0,
"step": 152
},
{
"epoch": 0.1069182389937107,
"grad_norm": 1.4933612831212741,
"learning_rate": 0.00019697211144902648,
"loss": 12.6299,
"num_tokens": 1095299.0,
"step": 153
},
{
"epoch": 0.10761705101327743,
"grad_norm": 1.5975456969431756,
"learning_rate": 0.00019691658759920624,
"loss": 12.7551,
"num_tokens": 1102272.0,
"step": 154
},
{
"epoch": 0.10831586303284417,
"grad_norm": 1.6091046101499575,
"learning_rate": 0.00019686056724996988,
"loss": 12.8102,
"num_tokens": 1108878.0,
"step": 155
},
{
"epoch": 0.1090146750524109,
"grad_norm": 1.4718644833323113,
"learning_rate": 0.00019680405068830717,
"loss": 12.748,
"num_tokens": 1116345.0,
"step": 156
},
{
"epoch": 0.10971348707197764,
"grad_norm": 1.672327371288932,
"learning_rate": 0.00019674703820374994,
"loss": 12.7993,
"num_tokens": 1123338.0,
"step": 157
},
{
"epoch": 0.11041229909154437,
"grad_norm": 1.6851443027272985,
"learning_rate": 0.0001966895300883707,
"loss": 12.6513,
"num_tokens": 1130116.0,
"step": 158
},
{
"epoch": 0.1111111111111111,
"grad_norm": 1.3721921552177048,
"learning_rate": 0.00019663152663678099,
"loss": 12.4606,
"num_tokens": 1137314.0,
"step": 159
},
{
"epoch": 0.11180992313067785,
"grad_norm": 1.5450593769920604,
"learning_rate": 0.0001965730281461299,
"loss": 12.8374,
"num_tokens": 1143586.0,
"step": 160
},
{
"epoch": 0.11250873515024458,
"grad_norm": 1.4770180074283248,
"learning_rate": 0.00019651403491610268,
"loss": 12.6782,
"num_tokens": 1150678.0,
"step": 161
},
{
"epoch": 0.11320754716981132,
"grad_norm": 1.445408495118887,
"learning_rate": 0.000196454547248919,
"loss": 12.645,
"num_tokens": 1158316.0,
"step": 162
},
{
"epoch": 0.11390635918937805,
"grad_norm": 1.5275106147817101,
"learning_rate": 0.00019639456544933155,
"loss": 12.7599,
"num_tokens": 1165236.0,
"step": 163
},
{
"epoch": 0.1146051712089448,
"grad_norm": 1.7109304556766742,
"learning_rate": 0.0001963340898246245,
"loss": 12.838,
"num_tokens": 1172589.0,
"step": 164
},
{
"epoch": 0.11530398322851153,
"grad_norm": 1.3543533906158267,
"learning_rate": 0.00019627312068461184,
"loss": 12.7582,
"num_tokens": 1179343.0,
"step": 165
},
{
"epoch": 0.11600279524807827,
"grad_norm": 1.4744050532881612,
"learning_rate": 0.00019621165834163572,
"loss": 12.6345,
"num_tokens": 1185779.0,
"step": 166
},
{
"epoch": 0.116701607267645,
"grad_norm": 1.5081447035442486,
"learning_rate": 0.00019614970311056503,
"loss": 12.9426,
"num_tokens": 1192364.0,
"step": 167
},
{
"epoch": 0.11740041928721175,
"grad_norm": 1.3892660490129107,
"learning_rate": 0.00019608725530879375,
"loss": 12.66,
"num_tokens": 1199385.0,
"step": 168
},
{
"epoch": 0.11809923130677848,
"grad_norm": 1.5001836923777718,
"learning_rate": 0.00019602431525623918,
"loss": 12.8446,
"num_tokens": 1206524.0,
"step": 169
},
{
"epoch": 0.1187980433263452,
"grad_norm": 1.4450185187090752,
"learning_rate": 0.00019596088327534047,
"loss": 12.5973,
"num_tokens": 1213487.0,
"step": 170
},
{
"epoch": 0.11949685534591195,
"grad_norm": 1.3844082514866582,
"learning_rate": 0.0001958969596910568,
"loss": 12.6159,
"num_tokens": 1220301.0,
"step": 171
},
{
"epoch": 0.12019566736547868,
"grad_norm": 1.3862156732184288,
"learning_rate": 0.000195832544830866,
"loss": 12.5869,
"num_tokens": 1227778.0,
"step": 172
},
{
"epoch": 0.12089447938504543,
"grad_norm": 1.5410063676466346,
"learning_rate": 0.00019576763902476242,
"loss": 12.6891,
"num_tokens": 1234261.0,
"step": 173
},
{
"epoch": 0.12159329140461216,
"grad_norm": 1.5794782038580049,
"learning_rate": 0.0001957022426052558,
"loss": 12.5885,
"num_tokens": 1241757.0,
"step": 174
},
{
"epoch": 0.1222921034241789,
"grad_norm": 1.4130269812346519,
"learning_rate": 0.00019563635590736901,
"loss": 12.5449,
"num_tokens": 1248424.0,
"step": 175
},
{
"epoch": 0.12299091544374563,
"grad_norm": 1.4804916312885361,
"learning_rate": 0.00019556997926863673,
"loss": 12.8005,
"num_tokens": 1255116.0,
"step": 176
},
{
"epoch": 0.12368972746331237,
"grad_norm": 1.391666293716997,
"learning_rate": 0.0001955031130291036,
"loss": 12.6645,
"num_tokens": 1262373.0,
"step": 177
},
{
"epoch": 0.1243885394828791,
"grad_norm": 1.462524566865328,
"learning_rate": 0.0001954357575313224,
"loss": 12.7713,
"num_tokens": 1268591.0,
"step": 178
},
{
"epoch": 0.12508735150244585,
"grad_norm": 1.3533516804568375,
"learning_rate": 0.0001953679131203524,
"loss": 12.6721,
"num_tokens": 1276309.0,
"step": 179
},
{
"epoch": 0.12578616352201258,
"grad_norm": 1.6846564652870581,
"learning_rate": 0.00019529958014375746,
"loss": 12.581,
"num_tokens": 1283604.0,
"step": 180
},
{
"epoch": 0.1264849755415793,
"grad_norm": 1.3037737767137891,
"learning_rate": 0.0001952307589516045,
"loss": 12.6895,
"num_tokens": 1290423.0,
"step": 181
},
{
"epoch": 0.12718378756114604,
"grad_norm": 1.4763749163183653,
"learning_rate": 0.00019516144989646143,
"loss": 12.7782,
"num_tokens": 1297162.0,
"step": 182
},
{
"epoch": 0.1278825995807128,
"grad_norm": 1.198895326351823,
"learning_rate": 0.00019509165333339551,
"loss": 12.577,
"num_tokens": 1304042.0,
"step": 183
},
{
"epoch": 0.12858141160027953,
"grad_norm": 1.4329209092636808,
"learning_rate": 0.0001950213696199714,
"loss": 12.4852,
"num_tokens": 1311266.0,
"step": 184
},
{
"epoch": 0.12928022361984626,
"grad_norm": 1.6220658819801768,
"learning_rate": 0.00019495059911624958,
"loss": 12.7953,
"num_tokens": 1317490.0,
"step": 185
},
{
"epoch": 0.129979035639413,
"grad_norm": 1.36259479273472,
"learning_rate": 0.00019487934218478413,
"loss": 12.6933,
"num_tokens": 1324708.0,
"step": 186
},
{
"epoch": 0.13067784765897975,
"grad_norm": 1.5346998304400725,
"learning_rate": 0.0001948075991906212,
"loss": 12.46,
"num_tokens": 1331506.0,
"step": 187
},
{
"epoch": 0.13137665967854648,
"grad_norm": 1.5977678492934153,
"learning_rate": 0.00019473537050129704,
"loss": 12.5766,
"num_tokens": 1338737.0,
"step": 188
},
{
"epoch": 0.1320754716981132,
"grad_norm": 1.4504648749412852,
"learning_rate": 0.00019466265648683602,
"loss": 12.6044,
"num_tokens": 1346238.0,
"step": 189
},
{
"epoch": 0.13277428371767994,
"grad_norm": 1.6038070232816393,
"learning_rate": 0.0001945894575197488,
"loss": 12.5935,
"num_tokens": 1353786.0,
"step": 190
},
{
"epoch": 0.1334730957372467,
"grad_norm": 1.516022452074653,
"learning_rate": 0.00019451577397503053,
"loss": 12.6887,
"num_tokens": 1360969.0,
"step": 191
},
{
"epoch": 0.13417190775681342,
"grad_norm": 1.475254652217503,
"learning_rate": 0.00019444160623015874,
"loss": 12.7507,
"num_tokens": 1368167.0,
"step": 192
},
{
"epoch": 0.13487071977638015,
"grad_norm": 1.6428399957055764,
"learning_rate": 0.00019436695466509152,
"loss": 12.4319,
"num_tokens": 1375092.0,
"step": 193
},
{
"epoch": 0.13556953179594688,
"grad_norm": 1.8224392036810861,
"learning_rate": 0.00019429181966226558,
"loss": 12.3294,
"num_tokens": 1383015.0,
"step": 194
},
{
"epoch": 0.13626834381551362,
"grad_norm": 1.4142248070635162,
"learning_rate": 0.00019421620160659417,
"loss": 12.4263,
"num_tokens": 1389785.0,
"step": 195
},
{
"epoch": 0.13696715583508037,
"grad_norm": 1.7732304272767438,
"learning_rate": 0.00019414010088546535,
"loss": 12.4284,
"num_tokens": 1397770.0,
"step": 196
},
{
"epoch": 0.1376659678546471,
"grad_norm": 1.2887513644847417,
"learning_rate": 0.00019406351788873972,
"loss": 12.3058,
"num_tokens": 1404674.0,
"step": 197
},
{
"epoch": 0.13836477987421383,
"grad_norm": 1.915692503409536,
"learning_rate": 0.00019398645300874865,
"loss": 12.7271,
"num_tokens": 1411618.0,
"step": 198
},
{
"epoch": 0.13906359189378056,
"grad_norm": 1.5922007813255112,
"learning_rate": 0.00019390890664029204,
"loss": 12.4225,
"num_tokens": 1418834.0,
"step": 199
},
{
"epoch": 0.13976240391334732,
"grad_norm": 1.562568647357345,
"learning_rate": 0.0001938308791806366,
"loss": 12.587,
"num_tokens": 1425877.0,
"step": 200
},
{
"epoch": 0.14046121593291405,
"grad_norm": 1.6484034424059983,
"learning_rate": 0.0001937523710295136,
"loss": 12.5672,
"num_tokens": 1432515.0,
"step": 201
},
{
"epoch": 0.14116002795248078,
"grad_norm": 1.3489309672054464,
"learning_rate": 0.00019367338258911675,
"loss": 12.514,
"num_tokens": 1439548.0,
"step": 202
},
{
"epoch": 0.1418588399720475,
"grad_norm": 1.8825352386410237,
"learning_rate": 0.0001935939142641004,
"loss": 12.4288,
"num_tokens": 1446322.0,
"step": 203
},
{
"epoch": 0.14255765199161424,
"grad_norm": 1.3919561230568087,
"learning_rate": 0.0001935139664615773,
"loss": 12.6324,
"num_tokens": 1453298.0,
"step": 204
},
{
"epoch": 0.143256464011181,
"grad_norm": 1.6509748790652776,
"learning_rate": 0.00019343353959111652,
"loss": 12.4141,
"num_tokens": 1460188.0,
"step": 205
},
{
"epoch": 0.14395527603074773,
"grad_norm": 1.6488562651674301,
"learning_rate": 0.00019335263406474137,
"loss": 12.4702,
"num_tokens": 1467199.0,
"step": 206
},
{
"epoch": 0.14465408805031446,
"grad_norm": 1.5210417677317358,
"learning_rate": 0.00019327125029692735,
"loss": 12.4063,
"num_tokens": 1474116.0,
"step": 207
},
{
"epoch": 0.1453529000698812,
"grad_norm": 1.4513815903689755,
"learning_rate": 0.00019318938870459984,
"loss": 12.4471,
"num_tokens": 1480988.0,
"step": 208
},
{
"epoch": 0.14605171208944795,
"grad_norm": 1.478959568743595,
"learning_rate": 0.00019310704970713224,
"loss": 12.2195,
"num_tokens": 1487900.0,
"step": 209
},
{
"epoch": 0.14675052410901468,
"grad_norm": 1.3540930180495643,
"learning_rate": 0.0001930242337263436,
"loss": 12.4247,
"num_tokens": 1495543.0,
"step": 210
},
{
"epoch": 0.1474493361285814,
"grad_norm": 1.5214245330484042,
"learning_rate": 0.00019294094118649653,
"loss": 12.23,
"num_tokens": 1502498.0,
"step": 211
},
{
"epoch": 0.14814814814814814,
"grad_norm": 1.4560111174480097,
"learning_rate": 0.00019285717251429506,
"loss": 12.2885,
"num_tokens": 1509580.0,
"step": 212
},
{
"epoch": 0.1488469601677149,
"grad_norm": 1.4716461734559856,
"learning_rate": 0.00019277292813888244,
"loss": 12.3907,
"num_tokens": 1516376.0,
"step": 213
},
{
"epoch": 0.14954577218728163,
"grad_norm": 1.375026775365828,
"learning_rate": 0.00019268820849183883,
"loss": 12.4456,
"num_tokens": 1523015.0,
"step": 214
},
{
"epoch": 0.15024458420684836,
"grad_norm": 1.5686453771772464,
"learning_rate": 0.00019260301400717938,
"loss": 12.577,
"num_tokens": 1530696.0,
"step": 215
},
{
"epoch": 0.1509433962264151,
"grad_norm": 1.3279067290386655,
"learning_rate": 0.00019251734512135157,
"loss": 12.7059,
"num_tokens": 1537893.0,
"step": 216
},
{
"epoch": 0.15164220824598182,
"grad_norm": 1.2564753041744463,
"learning_rate": 0.00019243120227323333,
"loss": 12.2507,
"num_tokens": 1545460.0,
"step": 217
},
{
"epoch": 0.15234102026554858,
"grad_norm": 1.2500188377338248,
"learning_rate": 0.00019234458590413077,
"loss": 12.2926,
"num_tokens": 1552764.0,
"step": 218
},
{
"epoch": 0.1530398322851153,
"grad_norm": 1.4480919319809458,
"learning_rate": 0.0001922574964577757,
"loss": 12.4254,
"num_tokens": 1559826.0,
"step": 219
},
{
"epoch": 0.15373864430468204,
"grad_norm": 1.4659507852094718,
"learning_rate": 0.0001921699343803235,
"loss": 12.5645,
"num_tokens": 1567575.0,
"step": 220
},
{
"epoch": 0.15443745632424877,
"grad_norm": 1.3643705799081125,
"learning_rate": 0.00019208190012035087,
"loss": 12.3877,
"num_tokens": 1574362.0,
"step": 221
},
{
"epoch": 0.15513626834381553,
"grad_norm": 2.007483521518634,
"learning_rate": 0.00019199339412885347,
"loss": 12.2945,
"num_tokens": 1581335.0,
"step": 222
},
{
"epoch": 0.15583508036338226,
"grad_norm": 1.3421403438430553,
"learning_rate": 0.00019190441685924353,
"loss": 12.2394,
"num_tokens": 1588536.0,
"step": 223
},
{
"epoch": 0.15653389238294899,
"grad_norm": 1.619215765976526,
"learning_rate": 0.00019181496876734776,
"loss": 12.4262,
"num_tokens": 1595480.0,
"step": 224
},
{
"epoch": 0.15723270440251572,
"grad_norm": 1.2307496781017198,
"learning_rate": 0.0001917250503114048,
"loss": 12.4242,
"num_tokens": 1602858.0,
"step": 225
},
{
"epoch": 0.15793151642208245,
"grad_norm": 1.3116673091039746,
"learning_rate": 0.0001916346619520629,
"loss": 12.2034,
"num_tokens": 1610558.0,
"step": 226
},
{
"epoch": 0.1586303284416492,
"grad_norm": 1.247482468822277,
"learning_rate": 0.00019154380415237768,
"loss": 12.5231,
"num_tokens": 1617490.0,
"step": 227
},
{
"epoch": 0.15932914046121593,
"grad_norm": 1.2513001860850739,
"learning_rate": 0.00019145247737780961,
"loss": 12.3687,
"num_tokens": 1624406.0,
"step": 228
},
{
"epoch": 0.16002795248078266,
"grad_norm": 1.158869133135163,
"learning_rate": 0.00019136068209622183,
"loss": 12.3431,
"num_tokens": 1631539.0,
"step": 229
},
{
"epoch": 0.1607267645003494,
"grad_norm": 1.2794317571864204,
"learning_rate": 0.00019126841877787745,
"loss": 12.2967,
"num_tokens": 1638417.0,
"step": 230
},
{
"epoch": 0.16142557651991615,
"grad_norm": 1.147970460946765,
"learning_rate": 0.00019117568789543742,
"loss": 12.2909,
"num_tokens": 1645769.0,
"step": 231
},
{
"epoch": 0.16212438853948288,
"grad_norm": 1.2532108422981187,
"learning_rate": 0.00019108248992395795,
"loss": 12.3622,
"num_tokens": 1653953.0,
"step": 232
},
{
"epoch": 0.1628232005590496,
"grad_norm": 1.0938253419136699,
"learning_rate": 0.0001909888253408882,
"loss": 12.1525,
"num_tokens": 1661632.0,
"step": 233
},
{
"epoch": 0.16352201257861634,
"grad_norm": 1.2405546810649237,
"learning_rate": 0.00019089469462606765,
"loss": 12.2075,
"num_tokens": 1668594.0,
"step": 234
},
{
"epoch": 0.1642208245981831,
"grad_norm": 1.197216633614794,
"learning_rate": 0.00019080009826172387,
"loss": 12.257,
"num_tokens": 1675137.0,
"step": 235
},
{
"epoch": 0.16491963661774983,
"grad_norm": 1.309655057462355,
"learning_rate": 0.00019070503673246982,
"loss": 12.238,
"num_tokens": 1682132.0,
"step": 236
},
{
"epoch": 0.16561844863731656,
"grad_norm": 1.244523044571817,
"learning_rate": 0.0001906095105253016,
"loss": 12.5012,
"num_tokens": 1688115.0,
"step": 237
},
{
"epoch": 0.1663172606568833,
"grad_norm": 1.3805924558908977,
"learning_rate": 0.00019051352012959568,
"loss": 12.3363,
"num_tokens": 1694968.0,
"step": 238
},
{
"epoch": 0.16701607267645002,
"grad_norm": 1.3621136578810145,
"learning_rate": 0.0001904170660371067,
"loss": 12.5149,
"num_tokens": 1702568.0,
"step": 239
},
{
"epoch": 0.16771488469601678,
"grad_norm": 1.388178453568404,
"learning_rate": 0.00019032014874196474,
"loss": 12.4916,
"num_tokens": 1709560.0,
"step": 240
},
{
"epoch": 0.1684136967155835,
"grad_norm": 1.1942869510376484,
"learning_rate": 0.0001902227687406728,
"loss": 12.337,
"num_tokens": 1717750.0,
"step": 241
},
{
"epoch": 0.16911250873515024,
"grad_norm": 1.4017872903001107,
"learning_rate": 0.0001901249265321044,
"loss": 12.2873,
"num_tokens": 1724736.0,
"step": 242
},
{
"epoch": 0.16981132075471697,
"grad_norm": 1.2742005252701514,
"learning_rate": 0.00019002662261750078,
"loss": 12.2834,
"num_tokens": 1732240.0,
"step": 243
},
{
"epoch": 0.17051013277428373,
"grad_norm": 1.354069951288616,
"learning_rate": 0.00018992785750046863,
"loss": 12.1109,
"num_tokens": 1739543.0,
"step": 244
},
{
"epoch": 0.17120894479385046,
"grad_norm": 1.2292579561482964,
"learning_rate": 0.00018982863168697734,
"loss": 12.357,
"num_tokens": 1746459.0,
"step": 245
},
{
"epoch": 0.1719077568134172,
"grad_norm": 1.3113989556044794,
"learning_rate": 0.00018972894568535634,
"loss": 12.4115,
"num_tokens": 1753478.0,
"step": 246
},
{
"epoch": 0.17260656883298392,
"grad_norm": 1.2595357641289937,
"learning_rate": 0.00018962880000629258,
"loss": 12.0726,
"num_tokens": 1760374.0,
"step": 247
},
{
"epoch": 0.17330538085255065,
"grad_norm": 1.2704656650350132,
"learning_rate": 0.0001895281951628281,
"loss": 12.4336,
"num_tokens": 1767809.0,
"step": 248
},
{
"epoch": 0.1740041928721174,
"grad_norm": 1.244384651549712,
"learning_rate": 0.000189427131670357,
"loss": 12.4644,
"num_tokens": 1775284.0,
"step": 249
},
{
"epoch": 0.17470300489168414,
"grad_norm": 1.2916600144904333,
"learning_rate": 0.00018932561004662312,
"loss": 12.1551,
"num_tokens": 1782896.0,
"step": 250
},
{
"epoch": 0.17540181691125087,
"grad_norm": 1.323531415132343,
"learning_rate": 0.00018922363081171723,
"loss": 12.238,
"num_tokens": 1790348.0,
"step": 251
},
{
"epoch": 0.1761006289308176,
"grad_norm": 1.2454277559117337,
"learning_rate": 0.0001891211944880746,
"loss": 12.6069,
"num_tokens": 1796663.0,
"step": 252
},
{
"epoch": 0.17679944095038436,
"grad_norm": 1.155499185809833,
"learning_rate": 0.00018901830160047184,
"loss": 12.1616,
"num_tokens": 1804575.0,
"step": 253
},
{
"epoch": 0.1774982529699511,
"grad_norm": 1.49853696436634,
"learning_rate": 0.0001889149526760248,
"loss": 12.3159,
"num_tokens": 1810818.0,
"step": 254
},
{
"epoch": 0.17819706498951782,
"grad_norm": 1.2735723628571658,
"learning_rate": 0.0001888111482441855,
"loss": 12.1924,
"num_tokens": 1817813.0,
"step": 255
},
{
"epoch": 0.17889587700908455,
"grad_norm": 1.3127755468927709,
"learning_rate": 0.00018870688883673936,
"loss": 12.2746,
"num_tokens": 1824365.0,
"step": 256
},
{
"epoch": 0.1795946890286513,
"grad_norm": 1.4197695569247741,
"learning_rate": 0.00018860217498780285,
"loss": 12.3002,
"num_tokens": 1831336.0,
"step": 257
},
{
"epoch": 0.18029350104821804,
"grad_norm": 1.461090176993007,
"learning_rate": 0.00018849700723382035,
"loss": 12.1504,
"num_tokens": 1838657.0,
"step": 258
},
{
"epoch": 0.18099231306778477,
"grad_norm": 1.3059767534305093,
"learning_rate": 0.0001883913861135617,
"loss": 12.2029,
"num_tokens": 1845965.0,
"step": 259
},
{
"epoch": 0.1816911250873515,
"grad_norm": 1.4172408521615742,
"learning_rate": 0.00018828531216811913,
"loss": 12.479,
"num_tokens": 1852638.0,
"step": 260
},
{
"epoch": 0.18238993710691823,
"grad_norm": 1.4950969515194465,
"learning_rate": 0.00018817878594090494,
"loss": 12.3214,
"num_tokens": 1859877.0,
"step": 261
},
{
"epoch": 0.18308874912648498,
"grad_norm": 1.921065371477902,
"learning_rate": 0.00018807180797764822,
"loss": 12.2991,
"num_tokens": 1866923.0,
"step": 262
},
{
"epoch": 0.18378756114605171,
"grad_norm": 1.5413785930712542,
"learning_rate": 0.00018796437882639242,
"loss": 12.1631,
"num_tokens": 1873292.0,
"step": 263
},
{
"epoch": 0.18448637316561844,
"grad_norm": 2.058759919250719,
"learning_rate": 0.00018785649903749234,
"loss": 12.2534,
"num_tokens": 1879744.0,
"step": 264
},
{
"epoch": 0.18518518518518517,
"grad_norm": 1.3730183956055928,
"learning_rate": 0.00018774816916361137,
"loss": 12.2737,
"num_tokens": 1886064.0,
"step": 265
},
{
"epoch": 0.18588399720475193,
"grad_norm": 1.771295055388875,
"learning_rate": 0.00018763938975971872,
"loss": 12.0608,
"num_tokens": 1893813.0,
"step": 266
},
{
"epoch": 0.18658280922431866,
"grad_norm": 1.325037236296118,
"learning_rate": 0.0001875301613830865,
"loss": 12.6401,
"num_tokens": 1901211.0,
"step": 267
},
{
"epoch": 0.1872816212438854,
"grad_norm": 1.650441961527195,
"learning_rate": 0.00018742048459328682,
"loss": 12.253,
"num_tokens": 1907987.0,
"step": 268
},
{
"epoch": 0.18798043326345212,
"grad_norm": 1.3109663633913111,
"learning_rate": 0.00018731035995218914,
"loss": 12.4699,
"num_tokens": 1915853.0,
"step": 269
},
{
"epoch": 0.18867924528301888,
"grad_norm": 1.6175242871144049,
"learning_rate": 0.00018719978802395705,
"loss": 12.0339,
"num_tokens": 1923310.0,
"step": 270
},
{
"epoch": 0.1893780573025856,
"grad_norm": 1.5608096103087008,
"learning_rate": 0.0001870887693750458,
"loss": 12.0746,
"num_tokens": 1930169.0,
"step": 271
},
{
"epoch": 0.19007686932215234,
"grad_norm": 1.301318723039762,
"learning_rate": 0.00018697730457419893,
"loss": 12.2536,
"num_tokens": 1937617.0,
"step": 272
},
{
"epoch": 0.19077568134171907,
"grad_norm": 1.3298561548013452,
"learning_rate": 0.00018686539419244578,
"loss": 12.0688,
"num_tokens": 1944358.0,
"step": 273
},
{
"epoch": 0.1914744933612858,
"grad_norm": 1.3519869211393252,
"learning_rate": 0.0001867530388030983,
"loss": 12.1702,
"num_tokens": 1951629.0,
"step": 274
},
{
"epoch": 0.19217330538085256,
"grad_norm": 1.3057581412928303,
"learning_rate": 0.00018664023898174817,
"loss": 12.1388,
"num_tokens": 1958779.0,
"step": 275
},
{
"epoch": 0.1928721174004193,
"grad_norm": 1.4643939657595633,
"learning_rate": 0.00018652699530626398,
"loss": 12.0666,
"num_tokens": 1966253.0,
"step": 276
},
{
"epoch": 0.19357092941998602,
"grad_norm": 1.220622846026274,
"learning_rate": 0.00018641330835678804,
"loss": 12.2549,
"num_tokens": 1973038.0,
"step": 277
},
{
"epoch": 0.19426974143955275,
"grad_norm": 1.271130886700587,
"learning_rate": 0.00018629917871573366,
"loss": 12.3878,
"num_tokens": 1980735.0,
"step": 278
},
{
"epoch": 0.1949685534591195,
"grad_norm": 1.2558395545718568,
"learning_rate": 0.0001861846069677819,
"loss": 12.1311,
"num_tokens": 1988250.0,
"step": 279
},
{
"epoch": 0.19566736547868624,
"grad_norm": 1.286119199563044,
"learning_rate": 0.00018606959369987883,
"loss": 12.208,
"num_tokens": 1995184.0,
"step": 280
},
{
"epoch": 0.19636617749825297,
"grad_norm": 1.2007396708232814,
"learning_rate": 0.00018595413950123235,
"loss": 12.0446,
"num_tokens": 2001901.0,
"step": 281
},
{
"epoch": 0.1970649895178197,
"grad_norm": 1.2677570769488615,
"learning_rate": 0.00018583824496330923,
"loss": 12.3823,
"num_tokens": 2009242.0,
"step": 282
},
{
"epoch": 0.19776380153738643,
"grad_norm": 1.2317309257123488,
"learning_rate": 0.00018572191067983216,
"loss": 12.3145,
"num_tokens": 2016167.0,
"step": 283
},
{
"epoch": 0.1984626135569532,
"grad_norm": 1.4383245513671332,
"learning_rate": 0.00018560513724677643,
"loss": 12.2043,
"num_tokens": 2023059.0,
"step": 284
},
{
"epoch": 0.19916142557651992,
"grad_norm": 1.3973942304081446,
"learning_rate": 0.00018548792526236732,
"loss": 12.0703,
"num_tokens": 2030297.0,
"step": 285
},
{
"epoch": 0.19986023759608665,
"grad_norm": 1.6715503163582084,
"learning_rate": 0.00018537027532707662,
"loss": 12.2566,
"num_tokens": 2036674.0,
"step": 286
},
{
"epoch": 0.20055904961565338,
"grad_norm": 1.5807473618196455,
"learning_rate": 0.00018525218804361977,
"loss": 11.9821,
"num_tokens": 2043766.0,
"step": 287
},
{
"epoch": 0.20125786163522014,
"grad_norm": 1.3939701688797845,
"learning_rate": 0.00018513366401695276,
"loss": 12.2849,
"num_tokens": 2051302.0,
"step": 288
},
{
"epoch": 0.20195667365478687,
"grad_norm": 1.66288008309786,
"learning_rate": 0.00018501470385426892,
"loss": 12.1162,
"num_tokens": 2058562.0,
"step": 289
},
{
"epoch": 0.2026554856743536,
"grad_norm": 1.205066449864526,
"learning_rate": 0.00018489530816499596,
"loss": 12.1756,
"num_tokens": 2065605.0,
"step": 290
},
{
"epoch": 0.20335429769392033,
"grad_norm": 1.4415331634901984,
"learning_rate": 0.00018477547756079276,
"loss": 12.1293,
"num_tokens": 2072590.0,
"step": 291
},
{
"epoch": 0.20405310971348709,
"grad_norm": 1.2758727814346498,
"learning_rate": 0.0001846552126555462,
"loss": 12.2186,
"num_tokens": 2080039.0,
"step": 292
},
{
"epoch": 0.20475192173305382,
"grad_norm": 1.365806213254789,
"learning_rate": 0.00018453451406536816,
"loss": 12.3626,
"num_tokens": 2086481.0,
"step": 293
},
{
"epoch": 0.20545073375262055,
"grad_norm": 1.258317016689637,
"learning_rate": 0.00018441338240859215,
"loss": 12.2221,
"num_tokens": 2093192.0,
"step": 294
},
{
"epoch": 0.20614954577218728,
"grad_norm": 1.2394183274844288,
"learning_rate": 0.00018429181830577034,
"loss": 12.1013,
"num_tokens": 2100572.0,
"step": 295
},
{
"epoch": 0.206848357791754,
"grad_norm": 1.5003520290968377,
"learning_rate": 0.00018416982237967028,
"loss": 12.2991,
"num_tokens": 2107911.0,
"step": 296
},
{
"epoch": 0.20754716981132076,
"grad_norm": 1.3085255308847732,
"learning_rate": 0.00018404739525527174,
"loss": 12.0833,
"num_tokens": 2115264.0,
"step": 297
},
{
"epoch": 0.2082459818308875,
"grad_norm": 1.6056696953344411,
"learning_rate": 0.0001839245375597635,
"loss": 12.0227,
"num_tokens": 2122114.0,
"step": 298
},
{
"epoch": 0.20894479385045422,
"grad_norm": 1.306758409586964,
"learning_rate": 0.0001838012499225401,
"loss": 12.0232,
"num_tokens": 2129186.0,
"step": 299
},
{
"epoch": 0.20964360587002095,
"grad_norm": 1.5183083261581376,
"learning_rate": 0.00018367753297519873,
"loss": 12.3017,
"num_tokens": 2136056.0,
"step": 300
},
{
"epoch": 0.2103424178895877,
"grad_norm": 1.3471382535225356,
"learning_rate": 0.00018355338735153587,
"loss": 12.0467,
"num_tokens": 2143135.0,
"step": 301
},
{
"epoch": 0.21104122990915444,
"grad_norm": 1.2459910456351586,
"learning_rate": 0.00018342881368754404,
"loss": 12.2002,
"num_tokens": 2149855.0,
"step": 302
},
{
"epoch": 0.21174004192872117,
"grad_norm": 1.411018357188159,
"learning_rate": 0.00018330381262140864,
"loss": 12.4267,
"num_tokens": 2156629.0,
"step": 303
},
{
"epoch": 0.2124388539482879,
"grad_norm": 1.231215569953917,
"learning_rate": 0.00018317838479350472,
"loss": 12.3044,
"num_tokens": 2163993.0,
"step": 304
},
{
"epoch": 0.21313766596785463,
"grad_norm": 1.282732106655346,
"learning_rate": 0.0001830525308463934,
"loss": 12.1661,
"num_tokens": 2170517.0,
"step": 305
},
{
"epoch": 0.2138364779874214,
"grad_norm": 1.3735651142447436,
"learning_rate": 0.00018292625142481906,
"loss": 12.2508,
"num_tokens": 2177805.0,
"step": 306
},
{
"epoch": 0.21453529000698812,
"grad_norm": 1.1267279398187773,
"learning_rate": 0.00018279954717570553,
"loss": 12.246,
"num_tokens": 2184824.0,
"step": 307
},
{
"epoch": 0.21523410202655485,
"grad_norm": 1.297904898358193,
"learning_rate": 0.00018267241874815314,
"loss": 11.951,
"num_tokens": 2192640.0,
"step": 308
},
{
"epoch": 0.21593291404612158,
"grad_norm": 1.168111311692671,
"learning_rate": 0.00018254486679343516,
"loss": 12.1526,
"num_tokens": 2199963.0,
"step": 309
},
{
"epoch": 0.21663172606568834,
"grad_norm": 1.2350993727401407,
"learning_rate": 0.00018241689196499475,
"loss": 12.1094,
"num_tokens": 2207388.0,
"step": 310
},
{
"epoch": 0.21733053808525507,
"grad_norm": 1.2380916588446629,
"learning_rate": 0.00018228849491844129,
"loss": 11.9739,
"num_tokens": 2214115.0,
"step": 311
},
{
"epoch": 0.2180293501048218,
"grad_norm": 1.2229988857994327,
"learning_rate": 0.00018215967631154717,
"loss": 12.0549,
"num_tokens": 2221801.0,
"step": 312
},
{
"epoch": 0.21872816212438853,
"grad_norm": 1.4655555313500719,
"learning_rate": 0.00018203043680424448,
"loss": 12.127,
"num_tokens": 2229449.0,
"step": 313
},
{
"epoch": 0.2194269741439553,
"grad_norm": 1.2682868579019313,
"learning_rate": 0.00018190077705862155,
"loss": 12.2111,
"num_tokens": 2236249.0,
"step": 314
},
{
"epoch": 0.22012578616352202,
"grad_norm": 1.6151789147467688,
"learning_rate": 0.00018177069773891953,
"loss": 12.0597,
"num_tokens": 2243354.0,
"step": 315
},
{
"epoch": 0.22082459818308875,
"grad_norm": 1.3508337349304373,
"learning_rate": 0.00018164019951152902,
"loss": 12.1405,
"num_tokens": 2249837.0,
"step": 316
},
{
"epoch": 0.22152341020265548,
"grad_norm": 1.760603025999751,
"learning_rate": 0.00018150928304498675,
"loss": 12.0609,
"num_tokens": 2256520.0,
"step": 317
},
{
"epoch": 0.2222222222222222,
"grad_norm": 1.305934795669062,
"learning_rate": 0.00018137794900997201,
"loss": 12.2233,
"num_tokens": 2263145.0,
"step": 318
},
{
"epoch": 0.22292103424178897,
"grad_norm": 1.491378308966807,
"learning_rate": 0.0001812461980793033,
"loss": 12.1678,
"num_tokens": 2269862.0,
"step": 319
},
{
"epoch": 0.2236198462613557,
"grad_norm": 1.2632967458884938,
"learning_rate": 0.0001811140309279348,
"loss": 12.3329,
"num_tokens": 2276820.0,
"step": 320
},
{
"epoch": 0.22431865828092243,
"grad_norm": 1.1158838193265348,
"learning_rate": 0.00018098144823295304,
"loss": 11.9781,
"num_tokens": 2284748.0,
"step": 321
},
{
"epoch": 0.22501747030048916,
"grad_norm": 1.2505380919755098,
"learning_rate": 0.00018084845067357336,
"loss": 12.0788,
"num_tokens": 2292140.0,
"step": 322
},
{
"epoch": 0.22571628232005592,
"grad_norm": 1.0921267435771704,
"learning_rate": 0.00018071503893113638,
"loss": 12.2769,
"num_tokens": 2300108.0,
"step": 323
},
{
"epoch": 0.22641509433962265,
"grad_norm": 1.3414524658246134,
"learning_rate": 0.00018058121368910458,
"loss": 11.8736,
"num_tokens": 2307811.0,
"step": 324
},
{
"epoch": 0.22711390635918938,
"grad_norm": 1.3336139939898337,
"learning_rate": 0.00018044697563305876,
"loss": 11.9168,
"num_tokens": 2315254.0,
"step": 325
},
{
"epoch": 0.2278127183787561,
"grad_norm": 1.2424706438980715,
"learning_rate": 0.00018031232545069468,
"loss": 12.1282,
"num_tokens": 2323116.0,
"step": 326
},
{
"epoch": 0.22851153039832284,
"grad_norm": 1.189509985734354,
"learning_rate": 0.00018017726383181925,
"loss": 12.1013,
"num_tokens": 2330812.0,
"step": 327
},
{
"epoch": 0.2292103424178896,
"grad_norm": 1.201526626317861,
"learning_rate": 0.0001800417914683471,
"loss": 12.1022,
"num_tokens": 2338451.0,
"step": 328
},
{
"epoch": 0.22990915443745633,
"grad_norm": 1.0841395533719858,
"learning_rate": 0.0001799059090542974,
"loss": 12.1923,
"num_tokens": 2346026.0,
"step": 329
},
{
"epoch": 0.23060796645702306,
"grad_norm": 1.0543640932645288,
"learning_rate": 0.00017976961728578963,
"loss": 12.0118,
"num_tokens": 2353605.0,
"step": 330
},
{
"epoch": 0.23130677847658979,
"grad_norm": 1.0221140744809603,
"learning_rate": 0.00017963291686104053,
"loss": 12.0598,
"num_tokens": 2360509.0,
"step": 331
},
{
"epoch": 0.23200559049615654,
"grad_norm": 1.1077266853542385,
"learning_rate": 0.00017949580848036046,
"loss": 12.2089,
"num_tokens": 2367035.0,
"step": 332
},
{
"epoch": 0.23270440251572327,
"grad_norm": 1.1017231412164032,
"learning_rate": 0.00017935829284614952,
"loss": 12.0369,
"num_tokens": 2373702.0,
"step": 333
},
{
"epoch": 0.23340321453529,
"grad_norm": 1.156329815157838,
"learning_rate": 0.00017922037066289432,
"loss": 12.1458,
"num_tokens": 2380174.0,
"step": 334
},
{
"epoch": 0.23410202655485673,
"grad_norm": 1.1589595989908332,
"learning_rate": 0.0001790820426371641,
"loss": 12.0406,
"num_tokens": 2387052.0,
"step": 335
},
{
"epoch": 0.2348008385744235,
"grad_norm": 1.0708490484699373,
"learning_rate": 0.00017894330947760726,
"loss": 11.8914,
"num_tokens": 2393866.0,
"step": 336
},
{
"epoch": 0.23549965059399022,
"grad_norm": 1.2056352526942165,
"learning_rate": 0.0001788041718949477,
"loss": 12.0097,
"num_tokens": 2401040.0,
"step": 337
},
{
"epoch": 0.23619846261355695,
"grad_norm": 1.034857406947483,
"learning_rate": 0.00017866463060198115,
"loss": 12.2169,
"num_tokens": 2408627.0,
"step": 338
},
{
"epoch": 0.23689727463312368,
"grad_norm": 1.1237746507948383,
"learning_rate": 0.00017852468631357146,
"loss": 12.1159,
"num_tokens": 2415390.0,
"step": 339
},
{
"epoch": 0.2375960866526904,
"grad_norm": 1.0771012425887634,
"learning_rate": 0.00017838433974664712,
"loss": 11.9807,
"num_tokens": 2422275.0,
"step": 340
},
{
"epoch": 0.23829489867225717,
"grad_norm": 1.0775272381287269,
"learning_rate": 0.00017824359162019738,
"loss": 12.039,
"num_tokens": 2429408.0,
"step": 341
},
{
"epoch": 0.2389937106918239,
"grad_norm": 1.157562166665061,
"learning_rate": 0.00017810244265526875,
"loss": 12.0734,
"num_tokens": 2436362.0,
"step": 342
},
{
"epoch": 0.23969252271139063,
"grad_norm": 1.141745416472308,
"learning_rate": 0.00017796089357496108,
"loss": 12.1806,
"num_tokens": 2442668.0,
"step": 343
},
{
"epoch": 0.24039133473095736,
"grad_norm": 1.1565392135388017,
"learning_rate": 0.0001778189451044242,
"loss": 12.1268,
"num_tokens": 2449013.0,
"step": 344
},
{
"epoch": 0.24109014675052412,
"grad_norm": 1.2199413936237042,
"learning_rate": 0.00017767659797085375,
"loss": 12.0651,
"num_tokens": 2455131.0,
"step": 345
},
{
"epoch": 0.24178895877009085,
"grad_norm": 1.1260507638707287,
"learning_rate": 0.000177533852903488,
"loss": 12.1853,
"num_tokens": 2462060.0,
"step": 346
},
{
"epoch": 0.24248777078965758,
"grad_norm": 1.0316001161122876,
"learning_rate": 0.0001773907106336035,
"loss": 12.1543,
"num_tokens": 2469590.0,
"step": 347
},
{
"epoch": 0.2431865828092243,
"grad_norm": 1.0821146376156172,
"learning_rate": 0.0001772471718945119,
"loss": 11.9286,
"num_tokens": 2476488.0,
"step": 348
},
{
"epoch": 0.24388539482879107,
"grad_norm": 1.0314206907032888,
"learning_rate": 0.0001771032374215558,
"loss": 11.9985,
"num_tokens": 2484657.0,
"step": 349
},
{
"epoch": 0.2445842068483578,
"grad_norm": 1.1661975155295445,
"learning_rate": 0.00017695890795210517,
"loss": 12.2751,
"num_tokens": 2491489.0,
"step": 350
},
{
"epoch": 0.24528301886792453,
"grad_norm": 1.006621569495116,
"learning_rate": 0.00017681418422555356,
"loss": 12.1682,
"num_tokens": 2498631.0,
"step": 351
},
{
"epoch": 0.24598183088749126,
"grad_norm": 1.017213858202063,
"learning_rate": 0.00017666906698331428,
"loss": 11.8183,
"num_tokens": 2506017.0,
"step": 352
},
{
"epoch": 0.246680642907058,
"grad_norm": 1.1601560219424385,
"learning_rate": 0.00017652355696881652,
"loss": 12.0538,
"num_tokens": 2513168.0,
"step": 353
},
{
"epoch": 0.24737945492662475,
"grad_norm": 1.1274194045715678,
"learning_rate": 0.0001763776549275017,
"loss": 12.0499,
"num_tokens": 2520301.0,
"step": 354
},
{
"epoch": 0.24807826694619148,
"grad_norm": 1.3076210428356354,
"learning_rate": 0.00017623136160681963,
"loss": 12.0843,
"num_tokens": 2527172.0,
"step": 355
},
{
"epoch": 0.2487770789657582,
"grad_norm": 1.129509433616736,
"learning_rate": 0.00017608467775622445,
"loss": 12.1999,
"num_tokens": 2534008.0,
"step": 356
},
{
"epoch": 0.24947589098532494,
"grad_norm": 1.1713262910481799,
"learning_rate": 0.00017593760412717117,
"loss": 12.1523,
"num_tokens": 2541533.0,
"step": 357
},
{
"epoch": 0.2501747030048917,
"grad_norm": 1.26537646556665,
"learning_rate": 0.0001757901414731115,
"loss": 12.2059,
"num_tokens": 2548063.0,
"step": 358
},
{
"epoch": 0.2508735150244584,
"grad_norm": 1.0858488934936994,
"learning_rate": 0.00017564229054949006,
"loss": 11.9977,
"num_tokens": 2555500.0,
"step": 359
},
{
"epoch": 0.25157232704402516,
"grad_norm": 1.2437942191749563,
"learning_rate": 0.0001754940521137407,
"loss": 12.0343,
"num_tokens": 2562431.0,
"step": 360
},
{
"epoch": 0.2522711390635919,
"grad_norm": 1.1822227089400206,
"learning_rate": 0.0001753454269252824,
"loss": 12.1325,
"num_tokens": 2569063.0,
"step": 361
},
{
"epoch": 0.2529699510831586,
"grad_norm": 1.224962109373223,
"learning_rate": 0.00017519641574551546,
"loss": 12.2002,
"num_tokens": 2576257.0,
"step": 362
},
{
"epoch": 0.25366876310272535,
"grad_norm": 1.205685504194018,
"learning_rate": 0.0001750470193378176,
"loss": 12.1142,
"num_tokens": 2582076.0,
"step": 363
},
{
"epoch": 0.2543675751222921,
"grad_norm": 1.2821801879030204,
"learning_rate": 0.00017489723846754002,
"loss": 12.2827,
"num_tokens": 2588634.0,
"step": 364
},
{
"epoch": 0.25506638714185886,
"grad_norm": 1.1426775318667641,
"learning_rate": 0.0001747470739020036,
"loss": 11.867,
"num_tokens": 2595988.0,
"step": 365
},
{
"epoch": 0.2557651991614256,
"grad_norm": 1.1637204489941761,
"learning_rate": 0.00017459652641049474,
"loss": 12.1299,
"num_tokens": 2602882.0,
"step": 366
},
{
"epoch": 0.2564640111809923,
"grad_norm": 1.1749766201575815,
"learning_rate": 0.0001744455967642616,
"loss": 12.0572,
"num_tokens": 2610428.0,
"step": 367
},
{
"epoch": 0.25716282320055905,
"grad_norm": 1.161910310989543,
"learning_rate": 0.00017429428573651024,
"loss": 11.7402,
"num_tokens": 2617338.0,
"step": 368
},
{
"epoch": 0.2578616352201258,
"grad_norm": 1.1754220027439937,
"learning_rate": 0.00017414259410240026,
"loss": 11.9926,
"num_tokens": 2624172.0,
"step": 369
},
{
"epoch": 0.2585604472396925,
"grad_norm": 1.2221271919493757,
"learning_rate": 0.0001739905226390413,
"loss": 11.9574,
"num_tokens": 2631185.0,
"step": 370
},
{
"epoch": 0.25925925925925924,
"grad_norm": 1.121957071755484,
"learning_rate": 0.0001738380721254888,
"loss": 12.0634,
"num_tokens": 2638925.0,
"step": 371
},
{
"epoch": 0.259958071278826,
"grad_norm": 1.0439041189495724,
"learning_rate": 0.00017368524334273998,
"loss": 12.0201,
"num_tokens": 2645812.0,
"step": 372
},
{
"epoch": 0.2606568832983927,
"grad_norm": 1.1527899155597479,
"learning_rate": 0.00017353203707373,
"loss": 12.2003,
"num_tokens": 2652935.0,
"step": 373
},
{
"epoch": 0.2613556953179595,
"grad_norm": 1.1907694393429749,
"learning_rate": 0.00017337845410332782,
"loss": 12.0194,
"num_tokens": 2659882.0,
"step": 374
},
{
"epoch": 0.2620545073375262,
"grad_norm": 1.1925500897396009,
"learning_rate": 0.0001732244952183323,
"loss": 11.985,
"num_tokens": 2667013.0,
"step": 375
},
{
"epoch": 0.26275331935709295,
"grad_norm": 1.1065412669686558,
"learning_rate": 0.000173070161207468,
"loss": 11.9913,
"num_tokens": 2675131.0,
"step": 376
},
{
"epoch": 0.2634521313766597,
"grad_norm": 1.1800802673651714,
"learning_rate": 0.00017291545286138126,
"loss": 12.0599,
"num_tokens": 2681743.0,
"step": 377
},
{
"epoch": 0.2641509433962264,
"grad_norm": 1.1658749096956924,
"learning_rate": 0.00017276037097263612,
"loss": 12.0414,
"num_tokens": 2688355.0,
"step": 378
},
{
"epoch": 0.26484975541579314,
"grad_norm": 1.3106109156886867,
"learning_rate": 0.00017260491633571033,
"loss": 11.9744,
"num_tokens": 2695315.0,
"step": 379
},
{
"epoch": 0.2655485674353599,
"grad_norm": 1.025449480192329,
"learning_rate": 0.0001724490897469911,
"loss": 11.9174,
"num_tokens": 2703258.0,
"step": 380
},
{
"epoch": 0.2662473794549266,
"grad_norm": 1.2314285772644886,
"learning_rate": 0.00017229289200477123,
"loss": 11.9577,
"num_tokens": 2710326.0,
"step": 381
},
{
"epoch": 0.2669461914744934,
"grad_norm": 1.117575231956547,
"learning_rate": 0.00017213632390924486,
"loss": 12.0226,
"num_tokens": 2716825.0,
"step": 382
},
{
"epoch": 0.2676450034940601,
"grad_norm": 1.2030793581305907,
"learning_rate": 0.00017197938626250348,
"loss": 12.0668,
"num_tokens": 2723868.0,
"step": 383
},
{
"epoch": 0.26834381551362685,
"grad_norm": 1.1807906081601707,
"learning_rate": 0.00017182207986853176,
"loss": 12.1037,
"num_tokens": 2730711.0,
"step": 384
},
{
"epoch": 0.2690426275331936,
"grad_norm": 1.1626829031353831,
"learning_rate": 0.00017166440553320337,
"loss": 11.8767,
"num_tokens": 2737540.0,
"step": 385
},
{
"epoch": 0.2697414395527603,
"grad_norm": 1.1958111650969308,
"learning_rate": 0.0001715063640642771,
"loss": 11.9674,
"num_tokens": 2744595.0,
"step": 386
},
{
"epoch": 0.27044025157232704,
"grad_norm": 1.0938579012100453,
"learning_rate": 0.00017134795627139236,
"loss": 12.0369,
"num_tokens": 2751663.0,
"step": 387
},
{
"epoch": 0.27113906359189377,
"grad_norm": 1.1147811767994924,
"learning_rate": 0.00017118918296606537,
"loss": 11.9541,
"num_tokens": 2759081.0,
"step": 388
},
{
"epoch": 0.2718378756114605,
"grad_norm": 1.0946017427329167,
"learning_rate": 0.00017103004496168473,
"loss": 11.9995,
"num_tokens": 2766249.0,
"step": 389
},
{
"epoch": 0.27253668763102723,
"grad_norm": 1.1630648910235608,
"learning_rate": 0.0001708705430735075,
"loss": 12.1063,
"num_tokens": 2773320.0,
"step": 390
},
{
"epoch": 0.273235499650594,
"grad_norm": 0.9921673795864713,
"learning_rate": 0.00017071067811865476,
"loss": 12.0095,
"num_tokens": 2780968.0,
"step": 391
},
{
"epoch": 0.27393431167016075,
"grad_norm": 1.1524595718563635,
"learning_rate": 0.0001705504509161077,
"loss": 11.9414,
"num_tokens": 2788319.0,
"step": 392
},
{
"epoch": 0.2746331236897275,
"grad_norm": 1.0585603726235264,
"learning_rate": 0.00017038986228670323,
"loss": 12.0465,
"num_tokens": 2796120.0,
"step": 393
},
{
"epoch": 0.2753319357092942,
"grad_norm": 1.0331878827292054,
"learning_rate": 0.00017022891305312987,
"loss": 11.8823,
"num_tokens": 2803198.0,
"step": 394
},
{
"epoch": 0.27603074772886094,
"grad_norm": 1.1409979908427963,
"learning_rate": 0.00017006760403992337,
"loss": 11.9414,
"num_tokens": 2809978.0,
"step": 395
},
{
"epoch": 0.27672955974842767,
"grad_norm": 1.040919985039819,
"learning_rate": 0.00016990593607346276,
"loss": 11.9296,
"num_tokens": 2816932.0,
"step": 396
},
{
"epoch": 0.2774283717679944,
"grad_norm": 1.0716219841086227,
"learning_rate": 0.00016974390998196595,
"loss": 11.8656,
"num_tokens": 2824857.0,
"step": 397
},
{
"epoch": 0.2781271837875611,
"grad_norm": 0.9753445323621813,
"learning_rate": 0.00016958152659548548,
"loss": 11.8725,
"num_tokens": 2832029.0,
"step": 398
},
{
"epoch": 0.27882599580712786,
"grad_norm": 1.127571712658559,
"learning_rate": 0.00016941878674590425,
"loss": 11.9015,
"num_tokens": 2839252.0,
"step": 399
},
{
"epoch": 0.27952480782669464,
"grad_norm": 1.1137834248990268,
"learning_rate": 0.00016925569126693136,
"loss": 11.8403,
"num_tokens": 2847177.0,
"step": 400
},
{
"epoch": 0.2802236198462614,
"grad_norm": 1.1431199419756541,
"learning_rate": 0.0001690922409940978,
"loss": 11.9518,
"num_tokens": 2854561.0,
"step": 401
},
{
"epoch": 0.2809224318658281,
"grad_norm": 1.1270384163061071,
"learning_rate": 0.00016892843676475212,
"loss": 11.8864,
"num_tokens": 2861654.0,
"step": 402
},
{
"epoch": 0.28162124388539483,
"grad_norm": 0.9948092143767864,
"learning_rate": 0.00016876427941805622,
"loss": 11.9772,
"num_tokens": 2868359.0,
"step": 403
},
{
"epoch": 0.28232005590496156,
"grad_norm": 1.0080698098570233,
"learning_rate": 0.00016859976979498092,
"loss": 11.8542,
"num_tokens": 2875713.0,
"step": 404
},
{
"epoch": 0.2830188679245283,
"grad_norm": 1.06487609830136,
"learning_rate": 0.00016843490873830178,
"loss": 11.8649,
"num_tokens": 2882164.0,
"step": 405
},
{
"epoch": 0.283717679944095,
"grad_norm": 1.0694632227640828,
"learning_rate": 0.00016826969709259477,
"loss": 12.224,
"num_tokens": 2888466.0,
"step": 406
},
{
"epoch": 0.28441649196366176,
"grad_norm": 1.0329263366216357,
"learning_rate": 0.0001681041357042319,
"loss": 11.8856,
"num_tokens": 2896054.0,
"step": 407
},
{
"epoch": 0.2851153039832285,
"grad_norm": 0.9866874389611432,
"learning_rate": 0.0001679382254213768,
"loss": 12.0074,
"num_tokens": 2903381.0,
"step": 408
},
{
"epoch": 0.28581411600279527,
"grad_norm": 1.0547830834870289,
"learning_rate": 0.00016777196709398065,
"loss": 12.066,
"num_tokens": 2910764.0,
"step": 409
},
{
"epoch": 0.286512928022362,
"grad_norm": 1.1391779846002652,
"learning_rate": 0.00016760536157377754,
"loss": 11.895,
"num_tokens": 2917852.0,
"step": 410
},
{
"epoch": 0.28721174004192873,
"grad_norm": 1.0177173312443486,
"learning_rate": 0.00016743840971428017,
"loss": 11.837,
"num_tokens": 2925426.0,
"step": 411
},
{
"epoch": 0.28791055206149546,
"grad_norm": 1.2162307058502813,
"learning_rate": 0.00016727111237077559,
"loss": 11.9744,
"num_tokens": 2932534.0,
"step": 412
},
{
"epoch": 0.2886093640810622,
"grad_norm": 1.0264120107199801,
"learning_rate": 0.00016710347040032076,
"loss": 11.9857,
"num_tokens": 2939544.0,
"step": 413
},
{
"epoch": 0.2893081761006289,
"grad_norm": 1.3014425270226597,
"learning_rate": 0.0001669354846617381,
"loss": 11.8196,
"num_tokens": 2946773.0,
"step": 414
},
{
"epoch": 0.29000698812019565,
"grad_norm": 1.0796383036656527,
"learning_rate": 0.00016676715601561117,
"loss": 12.0303,
"num_tokens": 2954251.0,
"step": 415
},
{
"epoch": 0.2907058001397624,
"grad_norm": 1.1175059429614296,
"learning_rate": 0.00016659848532428023,
"loss": 12.1162,
"num_tokens": 2961656.0,
"step": 416
},
{
"epoch": 0.2914046121593291,
"grad_norm": 1.0443096335885806,
"learning_rate": 0.00016642947345183774,
"loss": 11.9358,
"num_tokens": 2969059.0,
"step": 417
},
{
"epoch": 0.2921034241788959,
"grad_norm": 1.1183836131543219,
"learning_rate": 0.0001662601212641242,
"loss": 11.8719,
"num_tokens": 2977234.0,
"step": 418
},
{
"epoch": 0.29280223619846263,
"grad_norm": 1.0227099098344798,
"learning_rate": 0.00016609042962872333,
"loss": 11.8904,
"num_tokens": 2984854.0,
"step": 419
},
{
"epoch": 0.29350104821802936,
"grad_norm": 1.245079947315459,
"learning_rate": 0.00016592039941495804,
"loss": 11.6806,
"num_tokens": 2991537.0,
"step": 420
},
{
"epoch": 0.2941998602375961,
"grad_norm": 1.1291214580898223,
"learning_rate": 0.00016575003149388548,
"loss": 11.9606,
"num_tokens": 2998088.0,
"step": 421
},
{
"epoch": 0.2948986722571628,
"grad_norm": 1.0741415576787732,
"learning_rate": 0.00016557932673829311,
"loss": 12.0354,
"num_tokens": 3004876.0,
"step": 422
},
{
"epoch": 0.29559748427672955,
"grad_norm": 1.1830288131183533,
"learning_rate": 0.0001654082860226939,
"loss": 11.9615,
"num_tokens": 3011520.0,
"step": 423
},
{
"epoch": 0.2962962962962963,
"grad_norm": 1.073871290170833,
"learning_rate": 0.00016523691022332185,
"loss": 11.9096,
"num_tokens": 3018294.0,
"step": 424
},
{
"epoch": 0.296995108315863,
"grad_norm": 1.1812984503364992,
"learning_rate": 0.00016506520021812766,
"loss": 11.6186,
"num_tokens": 3026301.0,
"step": 425
},
{
"epoch": 0.2976939203354298,
"grad_norm": 0.9602274072933003,
"learning_rate": 0.00016489315688677416,
"loss": 11.8616,
"num_tokens": 3034166.0,
"step": 426
},
{
"epoch": 0.2983927323549965,
"grad_norm": 1.000734269306996,
"learning_rate": 0.00016472078111063175,
"loss": 11.7692,
"num_tokens": 3041930.0,
"step": 427
},
{
"epoch": 0.29909154437456326,
"grad_norm": 1.1119226968838083,
"learning_rate": 0.00016454807377277398,
"loss": 12.0168,
"num_tokens": 3048767.0,
"step": 428
},
{
"epoch": 0.29979035639413,
"grad_norm": 1.1154210947106469,
"learning_rate": 0.00016437503575797297,
"loss": 11.7643,
"num_tokens": 3055753.0,
"step": 429
},
{
"epoch": 0.3004891684136967,
"grad_norm": 1.0295541363836302,
"learning_rate": 0.00016420166795269475,
"loss": 12.0597,
"num_tokens": 3063120.0,
"step": 430
},
{
"epoch": 0.30118798043326345,
"grad_norm": 1.040910257992528,
"learning_rate": 0.00016402797124509508,
"loss": 11.9731,
"num_tokens": 3070742.0,
"step": 431
},
{
"epoch": 0.3018867924528302,
"grad_norm": 1.0526952080867569,
"learning_rate": 0.00016385394652501445,
"loss": 11.8993,
"num_tokens": 3078056.0,
"step": 432
},
{
"epoch": 0.3025856044723969,
"grad_norm": 1.0896170663855624,
"learning_rate": 0.00016367959468397393,
"loss": 11.8905,
"num_tokens": 3085214.0,
"step": 433
},
{
"epoch": 0.30328441649196364,
"grad_norm": 0.9283859474345019,
"learning_rate": 0.00016350491661517032,
"loss": 11.9101,
"num_tokens": 3092548.0,
"step": 434
},
{
"epoch": 0.3039832285115304,
"grad_norm": 0.986304057530193,
"learning_rate": 0.00016332991321347167,
"loss": 11.8033,
"num_tokens": 3100393.0,
"step": 435
},
{
"epoch": 0.30468204053109715,
"grad_norm": 1.1543372858478307,
"learning_rate": 0.0001631545853754127,
"loss": 12.1051,
"num_tokens": 3106325.0,
"step": 436
},
{
"epoch": 0.3053808525506639,
"grad_norm": 1.1966377576562668,
"learning_rate": 0.0001629789339991902,
"loss": 12.0699,
"num_tokens": 3113193.0,
"step": 437
},
{
"epoch": 0.3060796645702306,
"grad_norm": 1.2855963226674763,
"learning_rate": 0.0001628029599846585,
"loss": 11.9884,
"num_tokens": 3120358.0,
"step": 438
},
{
"epoch": 0.30677847658979734,
"grad_norm": 1.1830063545170302,
"learning_rate": 0.00016262666423332473,
"loss": 11.8667,
"num_tokens": 3127109.0,
"step": 439
},
{
"epoch": 0.3074772886093641,
"grad_norm": 1.0943107370858722,
"learning_rate": 0.00016245004764834422,
"loss": 11.7229,
"num_tokens": 3134426.0,
"step": 440
},
{
"epoch": 0.3081761006289308,
"grad_norm": 1.1001181023873448,
"learning_rate": 0.000162273111134516,
"loss": 11.9723,
"num_tokens": 3141188.0,
"step": 441
},
{
"epoch": 0.30887491264849753,
"grad_norm": 1.238166007571023,
"learning_rate": 0.00016209585559827806,
"loss": 11.9439,
"num_tokens": 3149034.0,
"step": 442
},
{
"epoch": 0.30957372466806427,
"grad_norm": 1.0126345029924844,
"learning_rate": 0.0001619182819477027,
"loss": 12.0405,
"num_tokens": 3156062.0,
"step": 443
},
{
"epoch": 0.31027253668763105,
"grad_norm": 1.1863059832196097,
"learning_rate": 0.0001617403910924919,
"loss": 11.7742,
"num_tokens": 3164141.0,
"step": 444
},
{
"epoch": 0.3109713487071978,
"grad_norm": 0.9943392593170401,
"learning_rate": 0.00016156218394397273,
"loss": 11.862,
"num_tokens": 3172103.0,
"step": 445
},
{
"epoch": 0.3116701607267645,
"grad_norm": 1.1431830948622974,
"learning_rate": 0.0001613836614150926,
"loss": 11.9805,
"num_tokens": 3178950.0,
"step": 446
},
{
"epoch": 0.31236897274633124,
"grad_norm": 1.0570354015055983,
"learning_rate": 0.00016120482442041447,
"loss": 11.8098,
"num_tokens": 3186094.0,
"step": 447
},
{
"epoch": 0.31306778476589797,
"grad_norm": 1.1551238616087514,
"learning_rate": 0.0001610256738761125,
"loss": 12.0324,
"num_tokens": 3193281.0,
"step": 448
},
{
"epoch": 0.3137665967854647,
"grad_norm": 1.153264276075561,
"learning_rate": 0.000160846210699967,
"loss": 11.8432,
"num_tokens": 3200265.0,
"step": 449
},
{
"epoch": 0.31446540880503143,
"grad_norm": 1.0812760599945195,
"learning_rate": 0.0001606664358113599,
"loss": 11.7047,
"num_tokens": 3207548.0,
"step": 450
},
{
"epoch": 0.31516422082459816,
"grad_norm": 1.0695050138356879,
"learning_rate": 0.00016048635013127016,
"loss": 11.9745,
"num_tokens": 3214372.0,
"step": 451
},
{
"epoch": 0.3158630328441649,
"grad_norm": 1.1212081487478163,
"learning_rate": 0.00016030595458226872,
"loss": 11.7326,
"num_tokens": 3221203.0,
"step": 452
},
{
"epoch": 0.3165618448637317,
"grad_norm": 1.0615326542920032,
"learning_rate": 0.00016012525008851403,
"loss": 11.8234,
"num_tokens": 3229249.0,
"step": 453
},
{
"epoch": 0.3172606568832984,
"grad_norm": 1.0924058533685397,
"learning_rate": 0.0001599442375757473,
"loss": 11.8809,
"num_tokens": 3236472.0,
"step": 454
},
{
"epoch": 0.31795946890286514,
"grad_norm": 1.0054105224939138,
"learning_rate": 0.00015976291797128767,
"loss": 11.8353,
"num_tokens": 3243897.0,
"step": 455
},
{
"epoch": 0.31865828092243187,
"grad_norm": 1.0836743188532598,
"learning_rate": 0.00015958129220402744,
"loss": 11.7473,
"num_tokens": 3250533.0,
"step": 456
},
{
"epoch": 0.3193570929419986,
"grad_norm": 1.1331172126168478,
"learning_rate": 0.00015939936120442752,
"loss": 11.9577,
"num_tokens": 3257466.0,
"step": 457
},
{
"epoch": 0.32005590496156533,
"grad_norm": 1.0605723822201687,
"learning_rate": 0.00015921712590451236,
"loss": 11.7791,
"num_tokens": 3263944.0,
"step": 458
},
{
"epoch": 0.32075471698113206,
"grad_norm": 1.2056325385887365,
"learning_rate": 0.00015903458723786544,
"loss": 11.9819,
"num_tokens": 3270543.0,
"step": 459
},
{
"epoch": 0.3214535290006988,
"grad_norm": 1.023946718695128,
"learning_rate": 0.00015885174613962426,
"loss": 11.9992,
"num_tokens": 3277894.0,
"step": 460
},
{
"epoch": 0.3221523410202656,
"grad_norm": 1.164177419749573,
"learning_rate": 0.00015866860354647576,
"loss": 11.908,
"num_tokens": 3284793.0,
"step": 461
},
{
"epoch": 0.3228511530398323,
"grad_norm": 0.974564180236439,
"learning_rate": 0.00015848516039665138,
"loss": 11.9273,
"num_tokens": 3292856.0,
"step": 462
},
{
"epoch": 0.32354996505939904,
"grad_norm": 1.1325311662596682,
"learning_rate": 0.0001583014176299223,
"loss": 11.6866,
"num_tokens": 3300134.0,
"step": 463
},
{
"epoch": 0.32424877707896577,
"grad_norm": 1.0215292754718461,
"learning_rate": 0.00015811737618759468,
"loss": 11.8115,
"num_tokens": 3307091.0,
"step": 464
},
{
"epoch": 0.3249475890985325,
"grad_norm": 1.0991030583076216,
"learning_rate": 0.00015793303701250468,
"loss": 11.7049,
"num_tokens": 3314380.0,
"step": 465
},
{
"epoch": 0.3256464011180992,
"grad_norm": 0.9723239955300694,
"learning_rate": 0.00015774840104901378,
"loss": 12.1343,
"num_tokens": 3322081.0,
"step": 466
},
{
"epoch": 0.32634521313766596,
"grad_norm": 1.1363739747714252,
"learning_rate": 0.000157563469243004,
"loss": 11.9717,
"num_tokens": 3329105.0,
"step": 467
},
{
"epoch": 0.3270440251572327,
"grad_norm": 1.0583331524691464,
"learning_rate": 0.00015737824254187275,
"loss": 11.9133,
"num_tokens": 3336405.0,
"step": 468
},
{
"epoch": 0.3277428371767994,
"grad_norm": 1.0618582403691783,
"learning_rate": 0.00015719272189452824,
"loss": 11.761,
"num_tokens": 3343087.0,
"step": 469
},
{
"epoch": 0.3284416491963662,
"grad_norm": 1.0202628694807843,
"learning_rate": 0.00015700690825138473,
"loss": 12.0182,
"num_tokens": 3350235.0,
"step": 470
},
{
"epoch": 0.32914046121593293,
"grad_norm": 0.9436248037294794,
"learning_rate": 0.00015682080256435724,
"loss": 11.8759,
"num_tokens": 3357485.0,
"step": 471
},
{
"epoch": 0.32983927323549966,
"grad_norm": 1.0693973366929455,
"learning_rate": 0.00015663440578685703,
"loss": 11.9772,
"num_tokens": 3364351.0,
"step": 472
},
{
"epoch": 0.3305380852550664,
"grad_norm": 0.9637424441716854,
"learning_rate": 0.00015644771887378663,
"loss": 11.862,
"num_tokens": 3372249.0,
"step": 473
},
{
"epoch": 0.3312368972746331,
"grad_norm": 1.099099553856128,
"learning_rate": 0.00015626074278153485,
"loss": 11.8989,
"num_tokens": 3379136.0,
"step": 474
},
{
"epoch": 0.33193570929419985,
"grad_norm": 1.0257686815570153,
"learning_rate": 0.000156073478467972,
"loss": 11.8664,
"num_tokens": 3386556.0,
"step": 475
},
{
"epoch": 0.3326345213137666,
"grad_norm": 0.9981099828581506,
"learning_rate": 0.0001558859268924449,
"loss": 11.6664,
"num_tokens": 3394436.0,
"step": 476
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.9935985234202279,
"learning_rate": 0.0001556980890157721,
"loss": 11.8036,
"num_tokens": 3401842.0,
"step": 477
},
{
"epoch": 0.33403214535290005,
"grad_norm": 0.9904221093625248,
"learning_rate": 0.00015550996580023868,
"loss": 11.6766,
"num_tokens": 3409180.0,
"step": 478
},
{
"epoch": 0.33473095737246683,
"grad_norm": 0.9644588966129022,
"learning_rate": 0.00015532155820959165,
"loss": 11.9491,
"num_tokens": 3416900.0,
"step": 479
},
{
"epoch": 0.33542976939203356,
"grad_norm": 0.9681833145177854,
"learning_rate": 0.00015513286720903485,
"loss": 11.9831,
"num_tokens": 3424074.0,
"step": 480
},
{
"epoch": 0.3361285814116003,
"grad_norm": 1.0017211776208106,
"learning_rate": 0.00015494389376522388,
"loss": 12.0019,
"num_tokens": 3431040.0,
"step": 481
},
{
"epoch": 0.336827393431167,
"grad_norm": 1.13259905993011,
"learning_rate": 0.0001547546388462615,
"loss": 11.9244,
"num_tokens": 3437949.0,
"step": 482
},
{
"epoch": 0.33752620545073375,
"grad_norm": 1.0001327316281,
"learning_rate": 0.00015456510342169225,
"loss": 11.6157,
"num_tokens": 3444767.0,
"step": 483
},
{
"epoch": 0.3382250174703005,
"grad_norm": 1.1862346561208217,
"learning_rate": 0.00015437528846249784,
"loss": 11.7979,
"num_tokens": 3452455.0,
"step": 484
},
{
"epoch": 0.3389238294898672,
"grad_norm": 0.9964598534250564,
"learning_rate": 0.00015418519494109185,
"loss": 11.8249,
"num_tokens": 3459075.0,
"step": 485
},
{
"epoch": 0.33962264150943394,
"grad_norm": 1.0499976134478521,
"learning_rate": 0.00015399482383131517,
"loss": 11.8271,
"num_tokens": 3466194.0,
"step": 486
},
{
"epoch": 0.3403214535290007,
"grad_norm": 1.0238007052339033,
"learning_rate": 0.0001538041761084305,
"loss": 11.8863,
"num_tokens": 3472861.0,
"step": 487
},
{
"epoch": 0.34102026554856746,
"grad_norm": 1.031413218747241,
"learning_rate": 0.00015361325274911779,
"loss": 11.8285,
"num_tokens": 3480127.0,
"step": 488
},
{
"epoch": 0.3417190775681342,
"grad_norm": 0.9109038752777893,
"learning_rate": 0.00015342205473146904,
"loss": 11.8997,
"num_tokens": 3487155.0,
"step": 489
},
{
"epoch": 0.3424178895877009,
"grad_norm": 1.0512658553310719,
"learning_rate": 0.00015323058303498324,
"loss": 11.7961,
"num_tokens": 3494773.0,
"step": 490
},
{
"epoch": 0.34311670160726765,
"grad_norm": 0.919699650030833,
"learning_rate": 0.00015303883864056154,
"loss": 11.7389,
"num_tokens": 3502169.0,
"step": 491
},
{
"epoch": 0.3438155136268344,
"grad_norm": 0.9713451834075437,
"learning_rate": 0.00015284682253050198,
"loss": 12.1026,
"num_tokens": 3509575.0,
"step": 492
},
{
"epoch": 0.3445143256464011,
"grad_norm": 1.071656337376466,
"learning_rate": 0.00015265453568849463,
"loss": 12.1382,
"num_tokens": 3516257.0,
"step": 493
},
{
"epoch": 0.34521313766596784,
"grad_norm": 0.9753487437886109,
"learning_rate": 0.0001524619790996166,
"loss": 12.0307,
"num_tokens": 3523550.0,
"step": 494
},
{
"epoch": 0.34591194968553457,
"grad_norm": 1.0299715129309743,
"learning_rate": 0.00015226915375032675,
"loss": 11.8399,
"num_tokens": 3530386.0,
"step": 495
},
{
"epoch": 0.3466107617051013,
"grad_norm": 0.9544143910020928,
"learning_rate": 0.00015207606062846092,
"loss": 11.8551,
"num_tokens": 3537432.0,
"step": 496
},
{
"epoch": 0.3473095737246681,
"grad_norm": 1.0699574494766773,
"learning_rate": 0.00015188270072322664,
"loss": 11.9395,
"num_tokens": 3544343.0,
"step": 497
},
{
"epoch": 0.3480083857442348,
"grad_norm": 0.976543421173678,
"learning_rate": 0.00015168907502519823,
"loss": 11.757,
"num_tokens": 3551437.0,
"step": 498
},
{
"epoch": 0.34870719776380155,
"grad_norm": 1.1037596936096155,
"learning_rate": 0.00015149518452631163,
"loss": 11.9166,
"num_tokens": 3558381.0,
"step": 499
},
{
"epoch": 0.3494060097833683,
"grad_norm": 1.1919912995611013,
"learning_rate": 0.00015130103021985928,
"loss": 11.8007,
"num_tokens": 3565537.0,
"step": 500
},
{
"epoch": 0.350104821802935,
"grad_norm": 1.0537606470925758,
"learning_rate": 0.00015110661310048523,
"loss": 12.0405,
"num_tokens": 3572664.0,
"step": 501
},
{
"epoch": 0.35080363382250174,
"grad_norm": 1.1601489554122157,
"learning_rate": 0.00015091193416417981,
"loss": 11.6978,
"num_tokens": 3579863.0,
"step": 502
},
{
"epoch": 0.35150244584206847,
"grad_norm": 0.9961821792223636,
"learning_rate": 0.00015071699440827462,
"loss": 11.9178,
"num_tokens": 3587973.0,
"step": 503
},
{
"epoch": 0.3522012578616352,
"grad_norm": 1.0220216417014585,
"learning_rate": 0.00015052179483143752,
"loss": 11.8212,
"num_tokens": 3594430.0,
"step": 504
},
{
"epoch": 0.352900069881202,
"grad_norm": 1.0918643967727748,
"learning_rate": 0.00015032633643366727,
"loss": 11.6821,
"num_tokens": 3600877.0,
"step": 505
},
{
"epoch": 0.3535988819007687,
"grad_norm": 1.0053389281755476,
"learning_rate": 0.0001501306202162887,
"loss": 11.7887,
"num_tokens": 3607903.0,
"step": 506
},
{
"epoch": 0.35429769392033544,
"grad_norm": 1.00938096807553,
"learning_rate": 0.0001499346471819474,
"loss": 11.8449,
"num_tokens": 3614967.0,
"step": 507
},
{
"epoch": 0.3549965059399022,
"grad_norm": 0.934535161353447,
"learning_rate": 0.00014973841833460457,
"loss": 11.9196,
"num_tokens": 3622644.0,
"step": 508
},
{
"epoch": 0.3556953179594689,
"grad_norm": 1.0532188197414574,
"learning_rate": 0.00014954193467953196,
"loss": 11.7554,
"num_tokens": 3629496.0,
"step": 509
},
{
"epoch": 0.35639412997903563,
"grad_norm": 1.0320538184578378,
"learning_rate": 0.0001493451972233067,
"loss": 11.8978,
"num_tokens": 3636251.0,
"step": 510
},
{
"epoch": 0.35709294199860236,
"grad_norm": 1.0169160736665257,
"learning_rate": 0.0001491482069738062,
"loss": 11.8188,
"num_tokens": 3643640.0,
"step": 511
},
{
"epoch": 0.3577917540181691,
"grad_norm": 0.9553371806973806,
"learning_rate": 0.00014895096494020274,
"loss": 11.737,
"num_tokens": 3651194.0,
"step": 512
},
{
"epoch": 0.3584905660377358,
"grad_norm": 0.9843783334182804,
"learning_rate": 0.00014875347213295863,
"loss": 11.6681,
"num_tokens": 3658441.0,
"step": 513
},
{
"epoch": 0.3591893780573026,
"grad_norm": 0.9703948672844441,
"learning_rate": 0.00014855572956382082,
"loss": 11.7661,
"num_tokens": 3665170.0,
"step": 514
},
{
"epoch": 0.35988819007686934,
"grad_norm": 0.9634487040177081,
"learning_rate": 0.0001483577382458158,
"loss": 11.8308,
"num_tokens": 3672474.0,
"step": 515
},
{
"epoch": 0.36058700209643607,
"grad_norm": 0.9876860191525895,
"learning_rate": 0.00014815949919324444,
"loss": 11.6739,
"num_tokens": 3679386.0,
"step": 516
},
{
"epoch": 0.3612858141160028,
"grad_norm": 1.0064827868977257,
"learning_rate": 0.00014796101342167664,
"loss": 11.9353,
"num_tokens": 3687308.0,
"step": 517
},
{
"epoch": 0.36198462613556953,
"grad_norm": 1.000619260490717,
"learning_rate": 0.00014776228194794623,
"loss": 11.7515,
"num_tokens": 3694073.0,
"step": 518
},
{
"epoch": 0.36268343815513626,
"grad_norm": 0.9365883961538043,
"learning_rate": 0.00014756330579014591,
"loss": 11.9024,
"num_tokens": 3701387.0,
"step": 519
},
{
"epoch": 0.363382250174703,
"grad_norm": 1.338573083068865,
"learning_rate": 0.0001473640859676217,
"loss": 11.5576,
"num_tokens": 3708600.0,
"step": 520
},
{
"epoch": 0.3640810621942697,
"grad_norm": 1.1132400116662757,
"learning_rate": 0.00014716462350096803,
"loss": 11.8507,
"num_tokens": 3716979.0,
"step": 521
},
{
"epoch": 0.36477987421383645,
"grad_norm": 1.3193521575988498,
"learning_rate": 0.0001469649194120224,
"loss": 11.8097,
"num_tokens": 3723012.0,
"step": 522
},
{
"epoch": 0.36547868623340324,
"grad_norm": 1.0403268561853904,
"learning_rate": 0.00014676497472385994,
"loss": 11.6589,
"num_tokens": 3730363.0,
"step": 523
},
{
"epoch": 0.36617749825296997,
"grad_norm": 1.2151940133509085,
"learning_rate": 0.0001465647904607886,
"loss": 11.7648,
"num_tokens": 3737054.0,
"step": 524
},
{
"epoch": 0.3668763102725367,
"grad_norm": 1.1114834210539923,
"learning_rate": 0.00014636436764834353,
"loss": 11.7175,
"num_tokens": 3743973.0,
"step": 525
},
{
"epoch": 0.36757512229210343,
"grad_norm": 1.0182021038420608,
"learning_rate": 0.000146163707313282,
"loss": 12.0104,
"num_tokens": 3750646.0,
"step": 526
},
{
"epoch": 0.36827393431167016,
"grad_norm": 1.0658085747221677,
"learning_rate": 0.00014596281048357806,
"loss": 11.8781,
"num_tokens": 3758294.0,
"step": 527
},
{
"epoch": 0.3689727463312369,
"grad_norm": 0.963002880311189,
"learning_rate": 0.0001457616781884173,
"loss": 11.6855,
"num_tokens": 3765428.0,
"step": 528
},
{
"epoch": 0.3696715583508036,
"grad_norm": 1.1131967814264112,
"learning_rate": 0.00014556031145819168,
"loss": 11.8129,
"num_tokens": 3772483.0,
"step": 529
},
{
"epoch": 0.37037037037037035,
"grad_norm": 0.9692436092208317,
"learning_rate": 0.0001453587113244941,
"loss": 11.8568,
"num_tokens": 3779599.0,
"step": 530
},
{
"epoch": 0.3710691823899371,
"grad_norm": 1.101540880954833,
"learning_rate": 0.00014515687882011313,
"loss": 11.789,
"num_tokens": 3786663.0,
"step": 531
},
{
"epoch": 0.37176799440950387,
"grad_norm": 1.050559906664223,
"learning_rate": 0.00014495481497902788,
"loss": 11.7851,
"num_tokens": 3793917.0,
"step": 532
},
{
"epoch": 0.3724668064290706,
"grad_norm": 1.1741989565793984,
"learning_rate": 0.00014475252083640246,
"loss": 11.8387,
"num_tokens": 3800383.0,
"step": 533
},
{
"epoch": 0.3731656184486373,
"grad_norm": 1.1113664176342144,
"learning_rate": 0.00014454999742858092,
"loss": 11.7885,
"num_tokens": 3807421.0,
"step": 534
},
{
"epoch": 0.37386443046820406,
"grad_norm": 1.180064570726451,
"learning_rate": 0.0001443472457930817,
"loss": 11.8684,
"num_tokens": 3814470.0,
"step": 535
},
{
"epoch": 0.3745632424877708,
"grad_norm": 1.2084454645287401,
"learning_rate": 0.0001441442669685926,
"loss": 11.8665,
"num_tokens": 3821117.0,
"step": 536
},
{
"epoch": 0.3752620545073375,
"grad_norm": 1.0286785364308508,
"learning_rate": 0.00014394106199496517,
"loss": 11.94,
"num_tokens": 3828050.0,
"step": 537
},
{
"epoch": 0.37596086652690425,
"grad_norm": 0.9866126433136005,
"learning_rate": 0.00014373763191320954,
"loss": 11.8129,
"num_tokens": 3835858.0,
"step": 538
},
{
"epoch": 0.376659678546471,
"grad_norm": 0.968366405112258,
"learning_rate": 0.00014353397776548912,
"loss": 11.8883,
"num_tokens": 3843141.0,
"step": 539
},
{
"epoch": 0.37735849056603776,
"grad_norm": 0.9164242552730942,
"learning_rate": 0.00014333010059511505,
"loss": 11.8982,
"num_tokens": 3850497.0,
"step": 540
},
{
"epoch": 0.3780573025856045,
"grad_norm": 0.9068419500979179,
"learning_rate": 0.0001431260014465412,
"loss": 11.3929,
"num_tokens": 3857621.0,
"step": 541
},
{
"epoch": 0.3787561146051712,
"grad_norm": 1.0163110836355063,
"learning_rate": 0.00014292168136535854,
"loss": 11.824,
"num_tokens": 3864403.0,
"step": 542
},
{
"epoch": 0.37945492662473795,
"grad_norm": 0.9074023579901379,
"learning_rate": 0.00014271714139828983,
"loss": 11.6444,
"num_tokens": 3871744.0,
"step": 543
},
{
"epoch": 0.3801537386443047,
"grad_norm": 0.9866662139219797,
"learning_rate": 0.0001425123825931843,
"loss": 11.725,
"num_tokens": 3879289.0,
"step": 544
},
{
"epoch": 0.3808525506638714,
"grad_norm": 0.9073257309786021,
"learning_rate": 0.00014230740599901231,
"loss": 11.6273,
"num_tokens": 3886810.0,
"step": 545
},
{
"epoch": 0.38155136268343814,
"grad_norm": 0.9207144140267274,
"learning_rate": 0.00014210221266585998,
"loss": 11.8695,
"num_tokens": 3894520.0,
"step": 546
},
{
"epoch": 0.3822501747030049,
"grad_norm": 0.93767292493419,
"learning_rate": 0.0001418968036449237,
"loss": 11.7849,
"num_tokens": 3901356.0,
"step": 547
},
{
"epoch": 0.3829489867225716,
"grad_norm": 0.9004540028187058,
"learning_rate": 0.0001416911799885049,
"loss": 11.7171,
"num_tokens": 3908231.0,
"step": 548
},
{
"epoch": 0.3836477987421384,
"grad_norm": 0.9581250266093889,
"learning_rate": 0.00014148534275000444,
"loss": 12.0426,
"num_tokens": 3915548.0,
"step": 549
},
{
"epoch": 0.3843466107617051,
"grad_norm": 1.0146876136163,
"learning_rate": 0.0001412792929839175,
"loss": 11.7198,
"num_tokens": 3922294.0,
"step": 550
},
{
"epoch": 0.38504542278127185,
"grad_norm": 0.9216708322241084,
"learning_rate": 0.00014107303174582794,
"loss": 11.662,
"num_tokens": 3929145.0,
"step": 551
},
{
"epoch": 0.3857442348008386,
"grad_norm": 1.0481953881329724,
"learning_rate": 0.00014086656009240306,
"loss": 11.7744,
"num_tokens": 3936061.0,
"step": 552
},
{
"epoch": 0.3864430468204053,
"grad_norm": 0.9829096758175492,
"learning_rate": 0.00014065987908138804,
"loss": 11.9194,
"num_tokens": 3942508.0,
"step": 553
},
{
"epoch": 0.38714185883997204,
"grad_norm": 1.0030318786032055,
"learning_rate": 0.00014045298977160057,
"loss": 11.8821,
"num_tokens": 3949004.0,
"step": 554
},
{
"epoch": 0.38784067085953877,
"grad_norm": 0.850860339193017,
"learning_rate": 0.00014024589322292555,
"loss": 11.7397,
"num_tokens": 3956901.0,
"step": 555
},
{
"epoch": 0.3885394828791055,
"grad_norm": 1.0103040436308783,
"learning_rate": 0.00014003859049630942,
"loss": 11.8767,
"num_tokens": 3964133.0,
"step": 556
},
{
"epoch": 0.38923829489867223,
"grad_norm": 0.9987701612040082,
"learning_rate": 0.000139831082653755,
"loss": 12.0504,
"num_tokens": 3971294.0,
"step": 557
},
{
"epoch": 0.389937106918239,
"grad_norm": 1.1429673333072519,
"learning_rate": 0.00013962337075831583,
"loss": 11.8142,
"num_tokens": 3977426.0,
"step": 558
},
{
"epoch": 0.39063591893780575,
"grad_norm": 0.9688917144423852,
"learning_rate": 0.00013941545587409075,
"loss": 11.698,
"num_tokens": 3984492.0,
"step": 559
},
{
"epoch": 0.3913347309573725,
"grad_norm": 1.0308616199533485,
"learning_rate": 0.00013920733906621862,
"loss": 11.9656,
"num_tokens": 3991120.0,
"step": 560
},
{
"epoch": 0.3920335429769392,
"grad_norm": 1.013985978635021,
"learning_rate": 0.00013899902140087272,
"loss": 11.8287,
"num_tokens": 3997971.0,
"step": 561
},
{
"epoch": 0.39273235499650594,
"grad_norm": 0.9071478462521866,
"learning_rate": 0.00013879050394525523,
"loss": 11.7164,
"num_tokens": 4005376.0,
"step": 562
},
{
"epoch": 0.39343116701607267,
"grad_norm": 1.0624818582024294,
"learning_rate": 0.00013858178776759197,
"loss": 11.8942,
"num_tokens": 4012471.0,
"step": 563
},
{
"epoch": 0.3941299790356394,
"grad_norm": 0.8936125162275714,
"learning_rate": 0.00013837287393712666,
"loss": 11.7496,
"num_tokens": 4019054.0,
"step": 564
},
{
"epoch": 0.39482879105520613,
"grad_norm": 1.0912085369465605,
"learning_rate": 0.00013816376352411574,
"loss": 11.846,
"num_tokens": 4025454.0,
"step": 565
},
{
"epoch": 0.39552760307477286,
"grad_norm": 0.9032002517173796,
"learning_rate": 0.00013795445759982262,
"loss": 11.5647,
"num_tokens": 4032992.0,
"step": 566
},
{
"epoch": 0.39622641509433965,
"grad_norm": 0.9257844804709106,
"learning_rate": 0.00013774495723651236,
"loss": 11.8157,
"num_tokens": 4040064.0,
"step": 567
},
{
"epoch": 0.3969252271139064,
"grad_norm": 1.0334706226849835,
"learning_rate": 0.0001375352635074461,
"loss": 11.6075,
"num_tokens": 4046967.0,
"step": 568
},
{
"epoch": 0.3976240391334731,
"grad_norm": 0.8363862917027438,
"learning_rate": 0.0001373253774868756,
"loss": 11.7502,
"num_tokens": 4055301.0,
"step": 569
},
{
"epoch": 0.39832285115303984,
"grad_norm": 0.9385148033516746,
"learning_rate": 0.00013711530025003766,
"loss": 11.7727,
"num_tokens": 4062431.0,
"step": 570
},
{
"epoch": 0.39902166317260657,
"grad_norm": 0.8299416376395171,
"learning_rate": 0.00013690503287314883,
"loss": 11.519,
"num_tokens": 4070230.0,
"step": 571
},
{
"epoch": 0.3997204751921733,
"grad_norm": 0.9976023645792326,
"learning_rate": 0.00013669457643339955,
"loss": 11.6711,
"num_tokens": 4077148.0,
"step": 572
},
{
"epoch": 0.40041928721174,
"grad_norm": 0.8855228788782702,
"learning_rate": 0.00013648393200894893,
"loss": 11.5901,
"num_tokens": 4084168.0,
"step": 573
},
{
"epoch": 0.40111809923130676,
"grad_norm": 0.9355806400309514,
"learning_rate": 0.00013627310067891913,
"loss": 11.7706,
"num_tokens": 4092313.0,
"step": 574
},
{
"epoch": 0.4018169112508735,
"grad_norm": 0.8887017159796101,
"learning_rate": 0.00013606208352338973,
"loss": 11.6336,
"num_tokens": 4099578.0,
"step": 575
},
{
"epoch": 0.4025157232704403,
"grad_norm": 0.896143619832332,
"learning_rate": 0.00013585088162339231,
"loss": 11.8035,
"num_tokens": 4106534.0,
"step": 576
},
{
"epoch": 0.403214535290007,
"grad_norm": 0.9717296350589316,
"learning_rate": 0.00013563949606090503,
"loss": 11.6552,
"num_tokens": 4113134.0,
"step": 577
},
{
"epoch": 0.40391334730957373,
"grad_norm": 0.8799694598345534,
"learning_rate": 0.00013542792791884674,
"loss": 11.6327,
"num_tokens": 4120863.0,
"step": 578
},
{
"epoch": 0.40461215932914046,
"grad_norm": 0.8714214123608119,
"learning_rate": 0.00013521617828107175,
"loss": 11.6445,
"num_tokens": 4127780.0,
"step": 579
},
{
"epoch": 0.4053109713487072,
"grad_norm": 0.9368995617188041,
"learning_rate": 0.00013500424823236412,
"loss": 11.7067,
"num_tokens": 4134829.0,
"step": 580
},
{
"epoch": 0.4060097833682739,
"grad_norm": 0.8220877834169437,
"learning_rate": 0.0001347921388584322,
"loss": 11.4554,
"num_tokens": 4142834.0,
"step": 581
},
{
"epoch": 0.40670859538784065,
"grad_norm": 0.9223723153491369,
"learning_rate": 0.000134579851245903,
"loss": 11.8592,
"num_tokens": 4149774.0,
"step": 582
},
{
"epoch": 0.4074074074074074,
"grad_norm": 0.9344353780152859,
"learning_rate": 0.00013436738648231656,
"loss": 11.7621,
"num_tokens": 4156412.0,
"step": 583
},
{
"epoch": 0.40810621942697417,
"grad_norm": 0.9224628889971425,
"learning_rate": 0.00013415474565612058,
"loss": 11.8511,
"num_tokens": 4163132.0,
"step": 584
},
{
"epoch": 0.4088050314465409,
"grad_norm": 0.9265931390375924,
"learning_rate": 0.00013394192985666465,
"loss": 11.7475,
"num_tokens": 4170145.0,
"step": 585
},
{
"epoch": 0.40950384346610763,
"grad_norm": 0.9535182759992712,
"learning_rate": 0.0001337289401741947,
"loss": 11.6528,
"num_tokens": 4177826.0,
"step": 586
},
{
"epoch": 0.41020265548567436,
"grad_norm": 0.9443710116876387,
"learning_rate": 0.0001335157776998476,
"loss": 11.6145,
"num_tokens": 4184556.0,
"step": 587
},
{
"epoch": 0.4109014675052411,
"grad_norm": 1.004187267294352,
"learning_rate": 0.00013330244352564527,
"loss": 11.6601,
"num_tokens": 4191042.0,
"step": 588
},
{
"epoch": 0.4116002795248078,
"grad_norm": 1.0157069700491397,
"learning_rate": 0.0001330889387444893,
"loss": 11.8948,
"num_tokens": 4197862.0,
"step": 589
},
{
"epoch": 0.41229909154437455,
"grad_norm": 0.9120585645887671,
"learning_rate": 0.00013287526445015531,
"loss": 11.824,
"num_tokens": 4205404.0,
"step": 590
},
{
"epoch": 0.4129979035639413,
"grad_norm": 0.8937750402751362,
"learning_rate": 0.0001326614217372873,
"loss": 11.6915,
"num_tokens": 4212599.0,
"step": 591
},
{
"epoch": 0.413696715583508,
"grad_norm": 0.9924820650387385,
"learning_rate": 0.0001324474117013921,
"loss": 11.9223,
"num_tokens": 4219704.0,
"step": 592
},
{
"epoch": 0.4143955276030748,
"grad_norm": 1.0041177228053997,
"learning_rate": 0.00013223323543883373,
"loss": 11.8224,
"num_tokens": 4226784.0,
"step": 593
},
{
"epoch": 0.41509433962264153,
"grad_norm": 0.9741629017640787,
"learning_rate": 0.0001320188940468277,
"loss": 11.6504,
"num_tokens": 4233717.0,
"step": 594
},
{
"epoch": 0.41579315164220826,
"grad_norm": 0.9601854433244981,
"learning_rate": 0.0001318043886234356,
"loss": 11.8218,
"num_tokens": 4240647.0,
"step": 595
},
{
"epoch": 0.416491963661775,
"grad_norm": 0.9645423908468603,
"learning_rate": 0.00013158972026755926,
"loss": 11.7377,
"num_tokens": 4247672.0,
"step": 596
},
{
"epoch": 0.4171907756813417,
"grad_norm": 0.9316349929157852,
"learning_rate": 0.0001313748900789352,
"loss": 11.6241,
"num_tokens": 4255152.0,
"step": 597
},
{
"epoch": 0.41788958770090845,
"grad_norm": 0.9537689079354016,
"learning_rate": 0.0001311598991581291,
"loss": 11.8703,
"num_tokens": 4261453.0,
"step": 598
},
{
"epoch": 0.4185883997204752,
"grad_norm": 0.9289919471319109,
"learning_rate": 0.00013094474860652987,
"loss": 11.6782,
"num_tokens": 4268508.0,
"step": 599
},
{
"epoch": 0.4192872117400419,
"grad_norm": 0.8914635789877776,
"learning_rate": 0.00013072943952634447,
"loss": 11.7243,
"num_tokens": 4275543.0,
"step": 600
},
{
"epoch": 0.41998602375960864,
"grad_norm": 0.9440659795291676,
"learning_rate": 0.00013051397302059171,
"loss": 11.8755,
"num_tokens": 4282191.0,
"step": 601
},
{
"epoch": 0.4206848357791754,
"grad_norm": 0.9585209926703763,
"learning_rate": 0.00013029835019309714,
"loss": 11.8258,
"num_tokens": 4289593.0,
"step": 602
},
{
"epoch": 0.42138364779874216,
"grad_norm": 0.7867669798656702,
"learning_rate": 0.000130082572148487,
"loss": 11.4998,
"num_tokens": 4297297.0,
"step": 603
},
{
"epoch": 0.4220824598183089,
"grad_norm": 0.8949794126524566,
"learning_rate": 0.00012986663999218261,
"loss": 11.7613,
"num_tokens": 4304161.0,
"step": 604
},
{
"epoch": 0.4227812718378756,
"grad_norm": 0.893864879487913,
"learning_rate": 0.00012965055483039507,
"loss": 11.5257,
"num_tokens": 4311640.0,
"step": 605
},
{
"epoch": 0.42348008385744235,
"grad_norm": 0.9586722537494932,
"learning_rate": 0.00012943431777011902,
"loss": 11.839,
"num_tokens": 4318619.0,
"step": 606
},
{
"epoch": 0.4241788958770091,
"grad_norm": 0.9537730403425981,
"learning_rate": 0.00012921792991912753,
"loss": 11.8218,
"num_tokens": 4325488.0,
"step": 607
},
{
"epoch": 0.4248777078965758,
"grad_norm": 0.8262921885502388,
"learning_rate": 0.00012900139238596598,
"loss": 11.4973,
"num_tokens": 4332936.0,
"step": 608
},
{
"epoch": 0.42557651991614254,
"grad_norm": 1.0157982223498707,
"learning_rate": 0.00012878470627994664,
"loss": 11.5374,
"num_tokens": 4339915.0,
"step": 609
},
{
"epoch": 0.42627533193570927,
"grad_norm": 0.8575390048878171,
"learning_rate": 0.0001285678727111429,
"loss": 11.5607,
"num_tokens": 4347052.0,
"step": 610
},
{
"epoch": 0.42697414395527605,
"grad_norm": 0.9730947204204807,
"learning_rate": 0.00012835089279038362,
"loss": 11.7061,
"num_tokens": 4353752.0,
"step": 611
},
{
"epoch": 0.4276729559748428,
"grad_norm": 0.9483802234693168,
"learning_rate": 0.00012813376762924733,
"loss": 11.836,
"num_tokens": 4361038.0,
"step": 612
},
{
"epoch": 0.4283717679944095,
"grad_norm": 0.9216292136491858,
"learning_rate": 0.0001279164983400568,
"loss": 11.7437,
"num_tokens": 4368560.0,
"step": 613
},
{
"epoch": 0.42907058001397624,
"grad_norm": 0.8425238285285168,
"learning_rate": 0.00012769908603587292,
"loss": 11.5207,
"num_tokens": 4376222.0,
"step": 614
},
{
"epoch": 0.429769392033543,
"grad_norm": 0.8896306896847135,
"learning_rate": 0.0001274815318304894,
"loss": 11.6286,
"num_tokens": 4383884.0,
"step": 615
},
{
"epoch": 0.4304682040531097,
"grad_norm": 0.9354680205418325,
"learning_rate": 0.0001272638368384269,
"loss": 11.6862,
"num_tokens": 4390978.0,
"step": 616
},
{
"epoch": 0.43116701607267643,
"grad_norm": 0.9479829150578614,
"learning_rate": 0.00012704600217492725,
"loss": 11.9,
"num_tokens": 4398693.0,
"step": 617
},
{
"epoch": 0.43186582809224316,
"grad_norm": 0.9515538105951693,
"learning_rate": 0.0001268280289559479,
"loss": 11.7401,
"num_tokens": 4405911.0,
"step": 618
},
{
"epoch": 0.43256464011180995,
"grad_norm": 0.8605687632190838,
"learning_rate": 0.00012660991829815602,
"loss": 11.6708,
"num_tokens": 4413468.0,
"step": 619
},
{
"epoch": 0.4332634521313767,
"grad_norm": 0.9579766386486308,
"learning_rate": 0.00012639167131892293,
"loss": 11.8158,
"num_tokens": 4420628.0,
"step": 620
},
{
"epoch": 0.4339622641509434,
"grad_norm": 0.9305350787646466,
"learning_rate": 0.0001261732891363183,
"loss": 11.7188,
"num_tokens": 4427900.0,
"step": 621
},
{
"epoch": 0.43466107617051014,
"grad_norm": 0.9798863355563663,
"learning_rate": 0.0001259547728691045,
"loss": 11.774,
"num_tokens": 4434789.0,
"step": 622
},
{
"epoch": 0.43535988819007687,
"grad_norm": 0.9247977118101478,
"learning_rate": 0.00012573612363673067,
"loss": 11.8669,
"num_tokens": 4442355.0,
"step": 623
},
{
"epoch": 0.4360587002096436,
"grad_norm": 0.9072492786959844,
"learning_rate": 0.00012551734255932727,
"loss": 11.5485,
"num_tokens": 4449354.0,
"step": 624
},
{
"epoch": 0.43675751222921033,
"grad_norm": 0.9280486905034977,
"learning_rate": 0.0001252984307577001,
"loss": 11.4915,
"num_tokens": 4456996.0,
"step": 625
},
{
"epoch": 0.43745632424877706,
"grad_norm": 0.9451836457137859,
"learning_rate": 0.00012507938935332478,
"loss": 11.6188,
"num_tokens": 4464108.0,
"step": 626
},
{
"epoch": 0.4381551362683438,
"grad_norm": 0.8990536388212861,
"learning_rate": 0.00012486021946834068,
"loss": 11.4984,
"num_tokens": 4471782.0,
"step": 627
},
{
"epoch": 0.4388539482879106,
"grad_norm": 0.9750832409444402,
"learning_rate": 0.00012464092222554552,
"loss": 11.4983,
"num_tokens": 4478870.0,
"step": 628
},
{
"epoch": 0.4395527603074773,
"grad_norm": 0.8506726198904068,
"learning_rate": 0.00012442149874838948,
"loss": 11.4597,
"num_tokens": 4486006.0,
"step": 629
},
{
"epoch": 0.44025157232704404,
"grad_norm": 0.9199807285091958,
"learning_rate": 0.00012420195016096933,
"loss": 11.8458,
"num_tokens": 4493433.0,
"step": 630
},
{
"epoch": 0.44095038434661077,
"grad_norm": 0.8020311533791471,
"learning_rate": 0.00012398227758802285,
"loss": 11.4522,
"num_tokens": 4501615.0,
"step": 631
},
{
"epoch": 0.4416491963661775,
"grad_norm": 0.9335899963926214,
"learning_rate": 0.00012376248215492297,
"loss": 11.7816,
"num_tokens": 4508453.0,
"step": 632
},
{
"epoch": 0.44234800838574423,
"grad_norm": 0.9347628048111192,
"learning_rate": 0.000123542564987672,
"loss": 11.6779,
"num_tokens": 4515645.0,
"step": 633
},
{
"epoch": 0.44304682040531096,
"grad_norm": 0.8555235370359207,
"learning_rate": 0.00012332252721289594,
"loss": 11.3559,
"num_tokens": 4522914.0,
"step": 634
},
{
"epoch": 0.4437456324248777,
"grad_norm": 0.9434725175049635,
"learning_rate": 0.00012310236995783866,
"loss": 11.8245,
"num_tokens": 4530012.0,
"step": 635
},
{
"epoch": 0.4444444444444444,
"grad_norm": 0.894759929105657,
"learning_rate": 0.00012288209435035605,
"loss": 11.633,
"num_tokens": 4536666.0,
"step": 636
},
{
"epoch": 0.4451432564640112,
"grad_norm": 0.8939776930969006,
"learning_rate": 0.00012266170151891036,
"loss": 11.6174,
"num_tokens": 4543673.0,
"step": 637
},
{
"epoch": 0.44584206848357794,
"grad_norm": 0.9419395016806058,
"learning_rate": 0.00012244119259256442,
"loss": 11.6986,
"num_tokens": 4550285.0,
"step": 638
},
{
"epoch": 0.44654088050314467,
"grad_norm": 0.900919444140853,
"learning_rate": 0.00012222056870097572,
"loss": 11.8951,
"num_tokens": 4557630.0,
"step": 639
},
{
"epoch": 0.4472396925227114,
"grad_norm": 0.9368150858158782,
"learning_rate": 0.00012199983097439079,
"loss": 11.7818,
"num_tokens": 4564552.0,
"step": 640
},
{
"epoch": 0.4479385045422781,
"grad_norm": 0.8542741429298996,
"learning_rate": 0.00012177898054363923,
"loss": 11.6147,
"num_tokens": 4572109.0,
"step": 641
},
{
"epoch": 0.44863731656184486,
"grad_norm": 0.9096968317268118,
"learning_rate": 0.00012155801854012816,
"loss": 11.6874,
"num_tokens": 4579365.0,
"step": 642
},
{
"epoch": 0.4493361285814116,
"grad_norm": 1.0188620411597427,
"learning_rate": 0.00012133694609583615,
"loss": 11.6009,
"num_tokens": 4585759.0,
"step": 643
},
{
"epoch": 0.4500349406009783,
"grad_norm": 0.9031994994109169,
"learning_rate": 0.00012111576434330766,
"loss": 11.7123,
"num_tokens": 4592247.0,
"step": 644
},
{
"epoch": 0.45073375262054505,
"grad_norm": 0.9149162319112071,
"learning_rate": 0.00012089447441564705,
"loss": 11.6632,
"num_tokens": 4599348.0,
"step": 645
},
{
"epoch": 0.45143256464011183,
"grad_norm": 0.9094933293195947,
"learning_rate": 0.00012067307744651288,
"loss": 11.4616,
"num_tokens": 4606162.0,
"step": 646
},
{
"epoch": 0.45213137665967856,
"grad_norm": 0.8663113470163256,
"learning_rate": 0.00012045157457011211,
"loss": 11.4333,
"num_tokens": 4612972.0,
"step": 647
},
{
"epoch": 0.4528301886792453,
"grad_norm": 0.8567265177969963,
"learning_rate": 0.00012022996692119424,
"loss": 11.647,
"num_tokens": 4620976.0,
"step": 648
},
{
"epoch": 0.453529000698812,
"grad_norm": 0.8601920989369755,
"learning_rate": 0.00012000825563504547,
"loss": 11.768,
"num_tokens": 4629303.0,
"step": 649
},
{
"epoch": 0.45422781271837875,
"grad_norm": 0.8865942670819554,
"learning_rate": 0.000119786441847483,
"loss": 11.5788,
"num_tokens": 4636638.0,
"step": 650
},
{
"epoch": 0.4549266247379455,
"grad_norm": 0.8777317880659796,
"learning_rate": 0.00011956452669484908,
"loss": 11.5788,
"num_tokens": 4643800.0,
"step": 651
},
{
"epoch": 0.4556254367575122,
"grad_norm": 0.9022730838918819,
"learning_rate": 0.0001193425113140053,
"loss": 11.7337,
"num_tokens": 4650901.0,
"step": 652
},
{
"epoch": 0.45632424877707894,
"grad_norm": 0.9200091529677351,
"learning_rate": 0.00011912039684232674,
"loss": 11.7567,
"num_tokens": 4657604.0,
"step": 653
},
{
"epoch": 0.4570230607966457,
"grad_norm": 0.9853157147946073,
"learning_rate": 0.000118898184417696,
"loss": 11.5672,
"num_tokens": 4664504.0,
"step": 654
},
{
"epoch": 0.45772187281621246,
"grad_norm": 0.9357526749090548,
"learning_rate": 0.00011867587517849757,
"loss": 11.6281,
"num_tokens": 4670477.0,
"step": 655
},
{
"epoch": 0.4584206848357792,
"grad_norm": 0.8505438973909345,
"learning_rate": 0.0001184534702636119,
"loss": 11.5583,
"num_tokens": 4677858.0,
"step": 656
},
{
"epoch": 0.4591194968553459,
"grad_norm": 1.0023298426002707,
"learning_rate": 0.00011823097081240964,
"loss": 11.5818,
"num_tokens": 4684626.0,
"step": 657
},
{
"epoch": 0.45981830887491265,
"grad_norm": 0.9108337238330952,
"learning_rate": 0.00011800837796474561,
"loss": 11.837,
"num_tokens": 4691582.0,
"step": 658
},
{
"epoch": 0.4605171208944794,
"grad_norm": 0.8958214114567762,
"learning_rate": 0.00011778569286095329,
"loss": 11.6986,
"num_tokens": 4699398.0,
"step": 659
},
{
"epoch": 0.4612159329140461,
"grad_norm": 0.8418306799615523,
"learning_rate": 0.00011756291664183859,
"loss": 11.6767,
"num_tokens": 4707448.0,
"step": 660
},
{
"epoch": 0.46191474493361284,
"grad_norm": 0.9201936424777087,
"learning_rate": 0.00011734005044867426,
"loss": 11.6697,
"num_tokens": 4714120.0,
"step": 661
},
{
"epoch": 0.46261355695317957,
"grad_norm": 0.9474217118708662,
"learning_rate": 0.00011711709542319411,
"loss": 11.6511,
"num_tokens": 4721137.0,
"step": 662
},
{
"epoch": 0.46331236897274636,
"grad_norm": 1.0911097821663276,
"learning_rate": 0.00011689405270758684,
"loss": 11.8961,
"num_tokens": 4727881.0,
"step": 663
},
{
"epoch": 0.4640111809923131,
"grad_norm": 0.9131509898854648,
"learning_rate": 0.00011667092344449053,
"loss": 11.7066,
"num_tokens": 4735809.0,
"step": 664
},
{
"epoch": 0.4647099930118798,
"grad_norm": 1.008550314106262,
"learning_rate": 0.00011644770877698654,
"loss": 11.8672,
"num_tokens": 4742094.0,
"step": 665
},
{
"epoch": 0.46540880503144655,
"grad_norm": 0.8283668414360017,
"learning_rate": 0.00011622440984859384,
"loss": 11.537,
"num_tokens": 4749810.0,
"step": 666
},
{
"epoch": 0.4661076170510133,
"grad_norm": 1.046661693335407,
"learning_rate": 0.00011600102780326296,
"loss": 11.6646,
"num_tokens": 4756916.0,
"step": 667
},
{
"epoch": 0.46680642907058,
"grad_norm": 0.8517203229245645,
"learning_rate": 0.00011577756378537033,
"loss": 11.6898,
"num_tokens": 4764365.0,
"step": 668
},
{
"epoch": 0.46750524109014674,
"grad_norm": 0.9243441693404036,
"learning_rate": 0.00011555401893971229,
"loss": 11.5513,
"num_tokens": 4771335.0,
"step": 669
},
{
"epoch": 0.46820405310971347,
"grad_norm": 0.8779927585010323,
"learning_rate": 0.00011533039441149926,
"loss": 11.5647,
"num_tokens": 4778789.0,
"step": 670
},
{
"epoch": 0.4689028651292802,
"grad_norm": 0.8629256515595302,
"learning_rate": 0.00011510669134634984,
"loss": 11.6128,
"num_tokens": 4786533.0,
"step": 671
},
{
"epoch": 0.469601677148847,
"grad_norm": 0.919436534596015,
"learning_rate": 0.000114882910890285,
"loss": 11.7525,
"num_tokens": 4793395.0,
"step": 672
},
{
"epoch": 0.4703004891684137,
"grad_norm": 0.8679940499104837,
"learning_rate": 0.00011465905418972216,
"loss": 11.6152,
"num_tokens": 4800501.0,
"step": 673
},
{
"epoch": 0.47099930118798045,
"grad_norm": 0.8595368570624692,
"learning_rate": 0.00011443512239146941,
"loss": 11.4401,
"num_tokens": 4807730.0,
"step": 674
},
{
"epoch": 0.4716981132075472,
"grad_norm": 0.9418065073409608,
"learning_rate": 0.00011421111664271946,
"loss": 11.9013,
"num_tokens": 4814631.0,
"step": 675
},
{
"epoch": 0.4723969252271139,
"grad_norm": 0.8698638525941518,
"learning_rate": 0.00011398703809104391,
"loss": 11.7185,
"num_tokens": 4821859.0,
"step": 676
},
{
"epoch": 0.47309573724668064,
"grad_norm": 0.9484728369720604,
"learning_rate": 0.00011376288788438734,
"loss": 11.783,
"num_tokens": 4829001.0,
"step": 677
},
{
"epoch": 0.47379454926624737,
"grad_norm": 0.8223477112780467,
"learning_rate": 0.00011353866717106137,
"loss": 11.8393,
"num_tokens": 4836233.0,
"step": 678
},
{
"epoch": 0.4744933612858141,
"grad_norm": 0.831097262102507,
"learning_rate": 0.0001133143770997389,
"loss": 11.6262,
"num_tokens": 4844216.0,
"step": 679
},
{
"epoch": 0.4751921733053808,
"grad_norm": 0.8328282276106426,
"learning_rate": 0.00011309001881944809,
"loss": 11.6812,
"num_tokens": 4851705.0,
"step": 680
},
{
"epoch": 0.4758909853249476,
"grad_norm": 0.9257538169159817,
"learning_rate": 0.00011286559347956651,
"loss": 11.8385,
"num_tokens": 4858279.0,
"step": 681
},
{
"epoch": 0.47658979734451434,
"grad_norm": 0.8413337207403613,
"learning_rate": 0.00011264110222981535,
"loss": 11.6961,
"num_tokens": 4866344.0,
"step": 682
},
{
"epoch": 0.4772886093640811,
"grad_norm": 0.8649763291413212,
"learning_rate": 0.00011241654622025334,
"loss": 11.5717,
"num_tokens": 4873494.0,
"step": 683
},
{
"epoch": 0.4779874213836478,
"grad_norm": 0.8267894686349591,
"learning_rate": 0.00011219192660127116,
"loss": 11.5825,
"num_tokens": 4880904.0,
"step": 684
},
{
"epoch": 0.47868623340321453,
"grad_norm": 0.7711166963102474,
"learning_rate": 0.00011196724452358516,
"loss": 11.5718,
"num_tokens": 4888663.0,
"step": 685
},
{
"epoch": 0.47938504542278126,
"grad_norm": 0.9464532926989868,
"learning_rate": 0.00011174250113823173,
"loss": 11.5705,
"num_tokens": 4895524.0,
"step": 686
},
{
"epoch": 0.480083857442348,
"grad_norm": 0.8496053713043877,
"learning_rate": 0.00011151769759656136,
"loss": 11.5792,
"num_tokens": 4902875.0,
"step": 687
},
{
"epoch": 0.4807826694619147,
"grad_norm": 0.9687523885064169,
"learning_rate": 0.00011129283505023274,
"loss": 11.6743,
"num_tokens": 4910339.0,
"step": 688
},
{
"epoch": 0.48148148148148145,
"grad_norm": 0.8936773368209414,
"learning_rate": 0.00011106791465120678,
"loss": 11.5858,
"num_tokens": 4917859.0,
"step": 689
},
{
"epoch": 0.48218029350104824,
"grad_norm": 0.8387839584310475,
"learning_rate": 0.00011084293755174083,
"loss": 11.529,
"num_tokens": 4924845.0,
"step": 690
},
{
"epoch": 0.48287910552061497,
"grad_norm": 0.8689568841128783,
"learning_rate": 0.0001106179049043826,
"loss": 11.6106,
"num_tokens": 4932581.0,
"step": 691
},
{
"epoch": 0.4835779175401817,
"grad_norm": 0.9581051097093195,
"learning_rate": 0.00011039281786196454,
"loss": 11.5746,
"num_tokens": 4938840.0,
"step": 692
},
{
"epoch": 0.48427672955974843,
"grad_norm": 0.8586009856955368,
"learning_rate": 0.00011016767757759758,
"loss": 11.5862,
"num_tokens": 4946122.0,
"step": 693
},
{
"epoch": 0.48497554157931516,
"grad_norm": 0.8457064008914603,
"learning_rate": 0.00010994248520466555,
"loss": 11.6757,
"num_tokens": 4953325.0,
"step": 694
},
{
"epoch": 0.4856743535988819,
"grad_norm": 0.8780641814348458,
"learning_rate": 0.00010971724189681907,
"loss": 11.4986,
"num_tokens": 4960210.0,
"step": 695
},
{
"epoch": 0.4863731656184486,
"grad_norm": 0.7763581662404005,
"learning_rate": 0.00010949194880796966,
"loss": 11.4152,
"num_tokens": 4968396.0,
"step": 696
},
{
"epoch": 0.48707197763801535,
"grad_norm": 0.8525009632274685,
"learning_rate": 0.000109266607092284,
"loss": 11.6096,
"num_tokens": 4975812.0,
"step": 697
},
{
"epoch": 0.48777078965758214,
"grad_norm": 0.8011788073233526,
"learning_rate": 0.00010904121790417767,
"loss": 11.5614,
"num_tokens": 4983615.0,
"step": 698
},
{
"epoch": 0.48846960167714887,
"grad_norm": 0.8819826065813017,
"learning_rate": 0.00010881578239830965,
"loss": 11.4381,
"num_tokens": 4990664.0,
"step": 699
},
{
"epoch": 0.4891684136967156,
"grad_norm": 0.7917077959661855,
"learning_rate": 0.0001085903017295761,
"loss": 11.576,
"num_tokens": 4997991.0,
"step": 700
},
{
"epoch": 0.48986722571628233,
"grad_norm": 0.8045632066937026,
"learning_rate": 0.00010836477705310457,
"loss": 11.4527,
"num_tokens": 5005535.0,
"step": 701
},
{
"epoch": 0.49056603773584906,
"grad_norm": 0.8324889010068139,
"learning_rate": 0.00010813920952424805,
"loss": 11.4868,
"num_tokens": 5012725.0,
"step": 702
},
{
"epoch": 0.4912648497554158,
"grad_norm": 0.8737336153739056,
"learning_rate": 0.00010791360029857908,
"loss": 11.5057,
"num_tokens": 5020226.0,
"step": 703
},
{
"epoch": 0.4919636617749825,
"grad_norm": 0.8651144067674705,
"learning_rate": 0.00010768795053188378,
"loss": 11.6567,
"num_tokens": 5027841.0,
"step": 704
},
{
"epoch": 0.49266247379454925,
"grad_norm": 0.8803712717375257,
"learning_rate": 0.00010746226138015605,
"loss": 11.5934,
"num_tokens": 5034672.0,
"step": 705
},
{
"epoch": 0.493361285814116,
"grad_norm": 0.9111814847188424,
"learning_rate": 0.00010723653399959141,
"loss": 11.5609,
"num_tokens": 5041763.0,
"step": 706
},
{
"epoch": 0.49406009783368277,
"grad_norm": 0.836271613979484,
"learning_rate": 0.00010701076954658133,
"loss": 11.7879,
"num_tokens": 5048922.0,
"step": 707
},
{
"epoch": 0.4947589098532495,
"grad_norm": 0.8749158004381932,
"learning_rate": 0.00010678496917770719,
"loss": 11.6949,
"num_tokens": 5056008.0,
"step": 708
},
{
"epoch": 0.4954577218728162,
"grad_norm": 0.9232202210604769,
"learning_rate": 0.00010655913404973432,
"loss": 11.5986,
"num_tokens": 5062729.0,
"step": 709
},
{
"epoch": 0.49615653389238296,
"grad_norm": 0.813868938711914,
"learning_rate": 0.0001063332653196062,
"loss": 11.6453,
"num_tokens": 5069602.0,
"step": 710
},
{
"epoch": 0.4968553459119497,
"grad_norm": 0.9242126754852555,
"learning_rate": 0.00010610736414443836,
"loss": 11.5618,
"num_tokens": 5076146.0,
"step": 711
},
{
"epoch": 0.4975541579315164,
"grad_norm": 0.8136052479087154,
"learning_rate": 0.00010588143168151257,
"loss": 11.4639,
"num_tokens": 5083612.0,
"step": 712
},
{
"epoch": 0.49825296995108315,
"grad_norm": 0.8852326396173759,
"learning_rate": 0.00010565546908827093,
"loss": 11.4881,
"num_tokens": 5090353.0,
"step": 713
},
{
"epoch": 0.4989517819706499,
"grad_norm": 0.8821183773577141,
"learning_rate": 0.00010542947752230987,
"loss": 11.483,
"num_tokens": 5098640.0,
"step": 714
},
{
"epoch": 0.4996505939902166,
"grad_norm": 0.785007875134882,
"learning_rate": 0.00010520345814137422,
"loss": 11.5312,
"num_tokens": 5106336.0,
"step": 715
},
{
"epoch": 0.5003494060097834,
"grad_norm": 0.9266326616202625,
"learning_rate": 0.0001049774121033514,
"loss": 11.5449,
"num_tokens": 5113949.0,
"step": 716
},
{
"epoch": 0.5010482180293501,
"grad_norm": 0.9045490686422497,
"learning_rate": 0.00010475134056626521,
"loss": 11.3111,
"num_tokens": 5120936.0,
"step": 717
},
{
"epoch": 0.5017470300489169,
"grad_norm": 0.8682422987772898,
"learning_rate": 0.00010452524468827028,
"loss": 11.5875,
"num_tokens": 5127936.0,
"step": 718
},
{
"epoch": 0.5024458420684835,
"grad_norm": 0.9430827830505436,
"learning_rate": 0.00010429912562764582,
"loss": 11.5213,
"num_tokens": 5135181.0,
"step": 719
},
{
"epoch": 0.5031446540880503,
"grad_norm": 0.9079946006860932,
"learning_rate": 0.00010407298454278983,
"loss": 11.668,
"num_tokens": 5141581.0,
"step": 720
},
{
"epoch": 0.5038434661076171,
"grad_norm": 0.8275366610829485,
"learning_rate": 0.00010384682259221314,
"loss": 11.5375,
"num_tokens": 5149210.0,
"step": 721
},
{
"epoch": 0.5045422781271838,
"grad_norm": 0.8396155615656928,
"learning_rate": 0.00010362064093453347,
"loss": 11.714,
"num_tokens": 5156153.0,
"step": 722
},
{
"epoch": 0.5052410901467506,
"grad_norm": 0.896827897280713,
"learning_rate": 0.00010339444072846955,
"loss": 11.5958,
"num_tokens": 5163697.0,
"step": 723
},
{
"epoch": 0.5059399021663172,
"grad_norm": 0.8796645326364783,
"learning_rate": 0.00010316822313283503,
"loss": 11.5598,
"num_tokens": 5170939.0,
"step": 724
},
{
"epoch": 0.506638714185884,
"grad_norm": 0.8530027365550926,
"learning_rate": 0.00010294198930653273,
"loss": 11.5458,
"num_tokens": 5178058.0,
"step": 725
},
{
"epoch": 0.5073375262054507,
"grad_norm": 0.9210847197223277,
"learning_rate": 0.00010271574040854863,
"loss": 11.4003,
"num_tokens": 5185093.0,
"step": 726
},
{
"epoch": 0.5080363382250175,
"grad_norm": 1.0009241150831982,
"learning_rate": 0.00010248947759794583,
"loss": 11.7382,
"num_tokens": 5191726.0,
"step": 727
},
{
"epoch": 0.5087351502445842,
"grad_norm": 0.9845840878696487,
"learning_rate": 0.00010226320203385878,
"loss": 11.4472,
"num_tokens": 5198609.0,
"step": 728
},
{
"epoch": 0.5094339622641509,
"grad_norm": 0.9751424145383517,
"learning_rate": 0.00010203691487548721,
"loss": 11.6166,
"num_tokens": 5206060.0,
"step": 729
},
{
"epoch": 0.5101327742837177,
"grad_norm": 0.9203888539094582,
"learning_rate": 0.00010181061728209034,
"loss": 11.6607,
"num_tokens": 5213009.0,
"step": 730
},
{
"epoch": 0.5108315863032844,
"grad_norm": 0.8788149722321699,
"learning_rate": 0.00010158431041298076,
"loss": 11.7453,
"num_tokens": 5220145.0,
"step": 731
},
{
"epoch": 0.5115303983228512,
"grad_norm": 0.903757013720806,
"learning_rate": 0.00010135799542751861,
"loss": 11.597,
"num_tokens": 5226851.0,
"step": 732
},
{
"epoch": 0.5122292103424179,
"grad_norm": 1.155119292678417,
"learning_rate": 0.0001011316734851056,
"loss": 11.4747,
"num_tokens": 5234276.0,
"step": 733
},
{
"epoch": 0.5129280223619846,
"grad_norm": 0.8100458853841396,
"learning_rate": 0.00010090534574517907,
"loss": 11.419,
"num_tokens": 5241284.0,
"step": 734
},
{
"epoch": 0.5136268343815513,
"grad_norm": 1.1407647261079725,
"learning_rate": 0.00010067901336720611,
"loss": 11.391,
"num_tokens": 5248568.0,
"step": 735
},
{
"epoch": 0.5143256464011181,
"grad_norm": 0.8989248775749288,
"learning_rate": 0.00010045267751067757,
"loss": 11.818,
"num_tokens": 5255337.0,
"step": 736
},
{
"epoch": 0.5150244584206848,
"grad_norm": 0.9297104357321131,
"learning_rate": 0.00010022633933510201,
"loss": 11.4153,
"num_tokens": 5262391.0,
"step": 737
},
{
"epoch": 0.5157232704402516,
"grad_norm": 0.8678797973500714,
"learning_rate": 0.0001,
"loss": 11.628,
"num_tokens": 5270611.0,
"step": 738
},
{
"epoch": 0.5164220824598184,
"grad_norm": 0.826651401983544,
"learning_rate": 9.977366066489801e-05,
"loss": 11.4746,
"num_tokens": 5278249.0,
"step": 739
},
{
"epoch": 0.517120894479385,
"grad_norm": 0.9125187235908128,
"learning_rate": 9.954732248932244e-05,
"loss": 11.6169,
"num_tokens": 5285271.0,
"step": 740
},
{
"epoch": 0.5178197064989518,
"grad_norm": 0.8816292587170192,
"learning_rate": 9.932098663279392e-05,
"loss": 11.4734,
"num_tokens": 5292168.0,
"step": 741
},
{
"epoch": 0.5185185185185185,
"grad_norm": 0.9512003221847414,
"learning_rate": 9.909465425482093e-05,
"loss": 11.638,
"num_tokens": 5298892.0,
"step": 742
},
{
"epoch": 0.5192173305380853,
"grad_norm": 0.9084938360187235,
"learning_rate": 9.886832651489444e-05,
"loss": 11.5939,
"num_tokens": 5305743.0,
"step": 743
},
{
"epoch": 0.519916142557652,
"grad_norm": 0.8670145766679175,
"learning_rate": 9.864200457248144e-05,
"loss": 11.4224,
"num_tokens": 5313196.0,
"step": 744
},
{
"epoch": 0.5206149545772187,
"grad_norm": 0.8331174116098162,
"learning_rate": 9.841568958701924e-05,
"loss": 11.4973,
"num_tokens": 5320688.0,
"step": 745
},
{
"epoch": 0.5213137665967854,
"grad_norm": 0.8997751578998335,
"learning_rate": 9.81893827179097e-05,
"loss": 11.4589,
"num_tokens": 5328457.0,
"step": 746
},
{
"epoch": 0.5220125786163522,
"grad_norm": 0.8720648157152265,
"learning_rate": 9.796308512451284e-05,
"loss": 11.6434,
"num_tokens": 5335909.0,
"step": 747
},
{
"epoch": 0.522711390635919,
"grad_norm": 0.973933585834344,
"learning_rate": 9.773679796614124e-05,
"loss": 11.3723,
"num_tokens": 5343377.0,
"step": 748
},
{
"epoch": 0.5234102026554857,
"grad_norm": 0.8303294482427562,
"learning_rate": 9.751052240205421e-05,
"loss": 11.5721,
"num_tokens": 5350725.0,
"step": 749
},
{
"epoch": 0.5241090146750524,
"grad_norm": 0.9573042241935518,
"learning_rate": 9.728425959145139e-05,
"loss": 11.5018,
"num_tokens": 5358235.0,
"step": 750
},
{
"epoch": 0.5248078266946191,
"grad_norm": 0.8319564056885251,
"learning_rate": 9.705801069346729e-05,
"loss": 11.613,
"num_tokens": 5365590.0,
"step": 751
},
{
"epoch": 0.5255066387141859,
"grad_norm": 0.8938099740380406,
"learning_rate": 9.683177686716501e-05,
"loss": 11.4718,
"num_tokens": 5371915.0,
"step": 752
},
{
"epoch": 0.5262054507337526,
"grad_norm": 1.0236425836011982,
"learning_rate": 9.660555927153047e-05,
"loss": 11.6484,
"num_tokens": 5378290.0,
"step": 753
},
{
"epoch": 0.5269042627533194,
"grad_norm": 0.7790090582597233,
"learning_rate": 9.637935906546655e-05,
"loss": 11.4802,
"num_tokens": 5385294.0,
"step": 754
},
{
"epoch": 0.527603074772886,
"grad_norm": 0.9227942573531065,
"learning_rate": 9.615317740778689e-05,
"loss": 11.6279,
"num_tokens": 5392707.0,
"step": 755
},
{
"epoch": 0.5283018867924528,
"grad_norm": 0.8441266244096841,
"learning_rate": 9.592701545721021e-05,
"loss": 11.5781,
"num_tokens": 5400588.0,
"step": 756
},
{
"epoch": 0.5290006988120196,
"grad_norm": 0.8729177064492472,
"learning_rate": 9.570087437235423e-05,
"loss": 11.6521,
"num_tokens": 5407337.0,
"step": 757
},
{
"epoch": 0.5296995108315863,
"grad_norm": 0.8659344216472323,
"learning_rate": 9.547475531172973e-05,
"loss": 11.5359,
"num_tokens": 5414577.0,
"step": 758
},
{
"epoch": 0.5303983228511531,
"grad_norm": 0.8714852832156825,
"learning_rate": 9.524865943373481e-05,
"loss": 11.5915,
"num_tokens": 5421211.0,
"step": 759
},
{
"epoch": 0.5310971348707197,
"grad_norm": 0.9244623269796611,
"learning_rate": 9.502258789664865e-05,
"loss": 11.6818,
"num_tokens": 5427910.0,
"step": 760
},
{
"epoch": 0.5317959468902865,
"grad_norm": 0.9155444602382034,
"learning_rate": 9.479654185862579e-05,
"loss": 11.6422,
"num_tokens": 5434742.0,
"step": 761
},
{
"epoch": 0.5324947589098532,
"grad_norm": 0.8219424880205499,
"learning_rate": 9.457052247769017e-05,
"loss": 11.5446,
"num_tokens": 5441794.0,
"step": 762
},
{
"epoch": 0.53319357092942,
"grad_norm": 0.7811646495161321,
"learning_rate": 9.434453091172908e-05,
"loss": 11.4499,
"num_tokens": 5449915.0,
"step": 763
},
{
"epoch": 0.5338923829489868,
"grad_norm": 0.8952879445607002,
"learning_rate": 9.411856831848745e-05,
"loss": 11.4182,
"num_tokens": 5456843.0,
"step": 764
},
{
"epoch": 0.5345911949685535,
"grad_norm": 0.8151057656551013,
"learning_rate": 9.38926358555617e-05,
"loss": 11.5544,
"num_tokens": 5464023.0,
"step": 765
},
{
"epoch": 0.5352900069881202,
"grad_norm": 0.8620924745150299,
"learning_rate": 9.366673468039383e-05,
"loss": 11.7123,
"num_tokens": 5471406.0,
"step": 766
},
{
"epoch": 0.5359888190076869,
"grad_norm": 0.9188647725830821,
"learning_rate": 9.34408659502657e-05,
"loss": 11.5564,
"num_tokens": 5478339.0,
"step": 767
},
{
"epoch": 0.5366876310272537,
"grad_norm": 0.8321100069995644,
"learning_rate": 9.321503082229282e-05,
"loss": 11.4913,
"num_tokens": 5485853.0,
"step": 768
},
{
"epoch": 0.5373864430468204,
"grad_norm": 0.8562467427898608,
"learning_rate": 9.298923045341869e-05,
"loss": 11.453,
"num_tokens": 5493547.0,
"step": 769
},
{
"epoch": 0.5380852550663872,
"grad_norm": 0.9138670376872613,
"learning_rate": 9.276346600040862e-05,
"loss": 11.2109,
"num_tokens": 5500515.0,
"step": 770
},
{
"epoch": 0.5387840670859538,
"grad_norm": 0.9006556795724268,
"learning_rate": 9.253773861984397e-05,
"loss": 11.4139,
"num_tokens": 5507599.0,
"step": 771
},
{
"epoch": 0.5394828791055206,
"grad_norm": 0.8553530215170654,
"learning_rate": 9.231204946811624e-05,
"loss": 11.5513,
"num_tokens": 5514593.0,
"step": 772
},
{
"epoch": 0.5401816911250874,
"grad_norm": 0.9148080781609712,
"learning_rate": 9.208639970142093e-05,
"loss": 11.5843,
"num_tokens": 5521763.0,
"step": 773
},
{
"epoch": 0.5408805031446541,
"grad_norm": 0.7554532212539814,
"learning_rate": 9.186079047575197e-05,
"loss": 11.5373,
"num_tokens": 5529580.0,
"step": 774
},
{
"epoch": 0.5415793151642209,
"grad_norm": 0.8307938551299585,
"learning_rate": 9.163522294689546e-05,
"loss": 11.461,
"num_tokens": 5536873.0,
"step": 775
},
{
"epoch": 0.5422781271837875,
"grad_norm": 0.9281257673464066,
"learning_rate": 9.140969827042391e-05,
"loss": 11.5544,
"num_tokens": 5543550.0,
"step": 776
},
{
"epoch": 0.5429769392033543,
"grad_norm": 0.8120164780002868,
"learning_rate": 9.118421760169038e-05,
"loss": 11.7136,
"num_tokens": 5550884.0,
"step": 777
},
{
"epoch": 0.543675751222921,
"grad_norm": 0.8836373379698687,
"learning_rate": 9.095878209582237e-05,
"loss": 11.386,
"num_tokens": 5557807.0,
"step": 778
},
{
"epoch": 0.5443745632424878,
"grad_norm": 0.9297380134031583,
"learning_rate": 9.073339290771603e-05,
"loss": 11.5867,
"num_tokens": 5564576.0,
"step": 779
},
{
"epoch": 0.5450733752620545,
"grad_norm": 0.7995838496582022,
"learning_rate": 9.050805119203035e-05,
"loss": 11.4059,
"num_tokens": 5572504.0,
"step": 780
},
{
"epoch": 0.5457721872816212,
"grad_norm": 0.8264976985889633,
"learning_rate": 9.028275810318095e-05,
"loss": 11.3345,
"num_tokens": 5579720.0,
"step": 781
},
{
"epoch": 0.546470999301188,
"grad_norm": 0.8302780963555162,
"learning_rate": 9.005751479533449e-05,
"loss": 11.5461,
"num_tokens": 5586866.0,
"step": 782
},
{
"epoch": 0.5471698113207547,
"grad_norm": 0.8842323310444606,
"learning_rate": 8.983232242240247e-05,
"loss": 11.4853,
"num_tokens": 5593273.0,
"step": 783
},
{
"epoch": 0.5478686233403215,
"grad_norm": 0.8079761026322886,
"learning_rate": 8.96071821380355e-05,
"loss": 11.6445,
"num_tokens": 5600825.0,
"step": 784
},
{
"epoch": 0.5485674353598882,
"grad_norm": 0.7976472224058269,
"learning_rate": 8.938209509561741e-05,
"loss": 11.5162,
"num_tokens": 5608068.0,
"step": 785
},
{
"epoch": 0.549266247379455,
"grad_norm": 0.7844336006627589,
"learning_rate": 8.91570624482592e-05,
"loss": 11.5521,
"num_tokens": 5615637.0,
"step": 786
},
{
"epoch": 0.5499650593990216,
"grad_norm": 0.8546739107820156,
"learning_rate": 8.893208534879324e-05,
"loss": 11.4099,
"num_tokens": 5622822.0,
"step": 787
},
{
"epoch": 0.5506638714185884,
"grad_norm": 0.7935052239619794,
"learning_rate": 8.87071649497673e-05,
"loss": 11.4709,
"num_tokens": 5629785.0,
"step": 788
},
{
"epoch": 0.5513626834381551,
"grad_norm": 0.8364823960249244,
"learning_rate": 8.848230240343865e-05,
"loss": 11.5328,
"num_tokens": 5636551.0,
"step": 789
},
{
"epoch": 0.5520614954577219,
"grad_norm": 0.8504654444826972,
"learning_rate": 8.82574988617683e-05,
"loss": 11.4773,
"num_tokens": 5643210.0,
"step": 790
},
{
"epoch": 0.5527603074772887,
"grad_norm": 0.8185493182608572,
"learning_rate": 8.803275547641488e-05,
"loss": 11.3797,
"num_tokens": 5650927.0,
"step": 791
},
{
"epoch": 0.5534591194968553,
"grad_norm": 0.7875124927027887,
"learning_rate": 8.780807339872886e-05,
"loss": 11.6004,
"num_tokens": 5658354.0,
"step": 792
},
{
"epoch": 0.5541579315164221,
"grad_norm": 0.9390755326975335,
"learning_rate": 8.758345377974667e-05,
"loss": 11.7106,
"num_tokens": 5664499.0,
"step": 793
},
{
"epoch": 0.5548567435359888,
"grad_norm": 0.8744733379949224,
"learning_rate": 8.735889777018465e-05,
"loss": 11.5987,
"num_tokens": 5670823.0,
"step": 794
},
{
"epoch": 0.5555555555555556,
"grad_norm": 0.7936257103943727,
"learning_rate": 8.71344065204335e-05,
"loss": 11.3932,
"num_tokens": 5678464.0,
"step": 795
},
{
"epoch": 0.5562543675751223,
"grad_norm": 0.9012746970901263,
"learning_rate": 8.690998118055193e-05,
"loss": 11.4493,
"num_tokens": 5685313.0,
"step": 796
},
{
"epoch": 0.556953179594689,
"grad_norm": 0.7949308089769119,
"learning_rate": 8.66856229002611e-05,
"loss": 11.6037,
"num_tokens": 5692972.0,
"step": 797
},
{
"epoch": 0.5576519916142557,
"grad_norm": 0.8986243273374567,
"learning_rate": 8.646133282893864e-05,
"loss": 11.4562,
"num_tokens": 5699327.0,
"step": 798
},
{
"epoch": 0.5583508036338225,
"grad_norm": 0.8149788916712181,
"learning_rate": 8.623711211561267e-05,
"loss": 11.4127,
"num_tokens": 5706702.0,
"step": 799
},
{
"epoch": 0.5590496156533893,
"grad_norm": 0.8205025674176395,
"learning_rate": 8.601296190895611e-05,
"loss": 11.5586,
"num_tokens": 5713918.0,
"step": 800
},
{
"epoch": 0.559748427672956,
"grad_norm": 0.9188648076919179,
"learning_rate": 8.578888335728057e-05,
"loss": 11.4676,
"num_tokens": 5721220.0,
"step": 801
},
{
"epoch": 0.5604472396925227,
"grad_norm": 0.8501915802973017,
"learning_rate": 8.55648776085306e-05,
"loss": 11.4786,
"num_tokens": 5728492.0,
"step": 802
},
{
"epoch": 0.5611460517120894,
"grad_norm": 0.8285958870852815,
"learning_rate": 8.534094581027785e-05,
"loss": 11.6043,
"num_tokens": 5735983.0,
"step": 803
},
{
"epoch": 0.5618448637316562,
"grad_norm": 0.944523071088074,
"learning_rate": 8.511708910971505e-05,
"loss": 11.5251,
"num_tokens": 5742678.0,
"step": 804
},
{
"epoch": 0.5625436757512229,
"grad_norm": 0.8081306301004181,
"learning_rate": 8.489330865365018e-05,
"loss": 11.4547,
"num_tokens": 5750553.0,
"step": 805
},
{
"epoch": 0.5632424877707897,
"grad_norm": 0.7754025315598624,
"learning_rate": 8.466960558850077e-05,
"loss": 11.2548,
"num_tokens": 5758022.0,
"step": 806
},
{
"epoch": 0.5639412997903563,
"grad_norm": 0.8196770260099395,
"learning_rate": 8.444598106028773e-05,
"loss": 11.4266,
"num_tokens": 5765255.0,
"step": 807
},
{
"epoch": 0.5646401118099231,
"grad_norm": 0.8107362785471051,
"learning_rate": 8.422243621462969e-05,
"loss": 11.4811,
"num_tokens": 5772746.0,
"step": 808
},
{
"epoch": 0.5653389238294899,
"grad_norm": 0.7690551203928506,
"learning_rate": 8.399897219673709e-05,
"loss": 11.3748,
"num_tokens": 5779819.0,
"step": 809
},
{
"epoch": 0.5660377358490566,
"grad_norm": 0.7912532894956382,
"learning_rate": 8.37755901514062e-05,
"loss": 11.4268,
"num_tokens": 5787293.0,
"step": 810
},
{
"epoch": 0.5667365478686234,
"grad_norm": 0.7530699743645662,
"learning_rate": 8.355229122301348e-05,
"loss": 11.4364,
"num_tokens": 5795358.0,
"step": 811
},
{
"epoch": 0.56743535988819,
"grad_norm": 0.8641916957536385,
"learning_rate": 8.332907655550948e-05,
"loss": 11.565,
"num_tokens": 5802192.0,
"step": 812
},
{
"epoch": 0.5681341719077568,
"grad_norm": 0.874534688203229,
"learning_rate": 8.310594729241317e-05,
"loss": 11.6021,
"num_tokens": 5809107.0,
"step": 813
},
{
"epoch": 0.5688329839273235,
"grad_norm": 0.7922582476273456,
"learning_rate": 8.288290457680591e-05,
"loss": 11.3441,
"num_tokens": 5815896.0,
"step": 814
},
{
"epoch": 0.5695317959468903,
"grad_norm": 0.8349253188255037,
"learning_rate": 8.265994955132572e-05,
"loss": 11.6314,
"num_tokens": 5823124.0,
"step": 815
},
{
"epoch": 0.570230607966457,
"grad_norm": 0.8699624382696525,
"learning_rate": 8.243708335816145e-05,
"loss": 11.6838,
"num_tokens": 5829951.0,
"step": 816
},
{
"epoch": 0.5709294199860238,
"grad_norm": 0.8100979857916469,
"learning_rate": 8.221430713904672e-05,
"loss": 11.3424,
"num_tokens": 5836842.0,
"step": 817
},
{
"epoch": 0.5716282320055905,
"grad_norm": 0.8477977602214554,
"learning_rate": 8.19916220352544e-05,
"loss": 11.5169,
"num_tokens": 5844342.0,
"step": 818
},
{
"epoch": 0.5723270440251572,
"grad_norm": 0.7768417118848057,
"learning_rate": 8.176902918759041e-05,
"loss": 11.4596,
"num_tokens": 5851608.0,
"step": 819
},
{
"epoch": 0.573025856044724,
"grad_norm": 0.8145926096488002,
"learning_rate": 8.15465297363881e-05,
"loss": 11.3234,
"num_tokens": 5858938.0,
"step": 820
},
{
"epoch": 0.5737246680642907,
"grad_norm": 0.8306120383140485,
"learning_rate": 8.132412482150245e-05,
"loss": 11.4715,
"num_tokens": 5865983.0,
"step": 821
},
{
"epoch": 0.5744234800838575,
"grad_norm": 0.8432187258645947,
"learning_rate": 8.110181558230404e-05,
"loss": 11.5188,
"num_tokens": 5872600.0,
"step": 822
},
{
"epoch": 0.5751222921034241,
"grad_norm": 0.913675498107121,
"learning_rate": 8.087960315767328e-05,
"loss": 11.4379,
"num_tokens": 5878398.0,
"step": 823
},
{
"epoch": 0.5758211041229909,
"grad_norm": 0.7767964555554504,
"learning_rate": 8.06574886859947e-05,
"loss": 11.5975,
"num_tokens": 5886022.0,
"step": 824
},
{
"epoch": 0.5765199161425576,
"grad_norm": 0.8067890059558731,
"learning_rate": 8.043547330515092e-05,
"loss": 11.417,
"num_tokens": 5893141.0,
"step": 825
},
{
"epoch": 0.5772187281621244,
"grad_norm": 0.7778204138902519,
"learning_rate": 8.021355815251703e-05,
"loss": 11.4109,
"num_tokens": 5900162.0,
"step": 826
},
{
"epoch": 0.5779175401816912,
"grad_norm": 0.8581915102507534,
"learning_rate": 7.999174436495456e-05,
"loss": 11.3281,
"num_tokens": 5907169.0,
"step": 827
},
{
"epoch": 0.5786163522012578,
"grad_norm": 0.7943071251832075,
"learning_rate": 7.97700330788058e-05,
"loss": 11.5825,
"num_tokens": 5914949.0,
"step": 828
},
{
"epoch": 0.5793151642208246,
"grad_norm": 0.8307994124468097,
"learning_rate": 7.954842542988792e-05,
"loss": 11.5205,
"num_tokens": 5921813.0,
"step": 829
},
{
"epoch": 0.5800139762403913,
"grad_norm": 0.7956011180143502,
"learning_rate": 7.932692255348711e-05,
"loss": 11.6589,
"num_tokens": 5929467.0,
"step": 830
},
{
"epoch": 0.5807127882599581,
"grad_norm": 0.751578867815494,
"learning_rate": 7.910552558435297e-05,
"loss": 11.3462,
"num_tokens": 5936756.0,
"step": 831
},
{
"epoch": 0.5814116002795248,
"grad_norm": 0.7268812168656755,
"learning_rate": 7.888423565669236e-05,
"loss": 11.4221,
"num_tokens": 5944825.0,
"step": 832
},
{
"epoch": 0.5821104122990916,
"grad_norm": 0.8006177593002789,
"learning_rate": 7.866305390416385e-05,
"loss": 11.4256,
"num_tokens": 5951987.0,
"step": 833
},
{
"epoch": 0.5828092243186582,
"grad_norm": 0.7402596515700245,
"learning_rate": 7.844198145987187e-05,
"loss": 11.3692,
"num_tokens": 5959236.0,
"step": 834
},
{
"epoch": 0.583508036338225,
"grad_norm": 0.8602548653263015,
"learning_rate": 7.82210194563608e-05,
"loss": 11.7315,
"num_tokens": 5966125.0,
"step": 835
},
{
"epoch": 0.5842068483577918,
"grad_norm": 0.7749004598919897,
"learning_rate": 7.800016902560924e-05,
"loss": 11.4858,
"num_tokens": 5974067.0,
"step": 836
},
{
"epoch": 0.5849056603773585,
"grad_norm": 0.833537255516446,
"learning_rate": 7.77794312990243e-05,
"loss": 11.4515,
"num_tokens": 5980876.0,
"step": 837
},
{
"epoch": 0.5856044723969253,
"grad_norm": 0.8541885917387239,
"learning_rate": 7.755880740743559e-05,
"loss": 11.6651,
"num_tokens": 5988494.0,
"step": 838
},
{
"epoch": 0.5863032844164919,
"grad_norm": 0.8342950784145347,
"learning_rate": 7.733829848108965e-05,
"loss": 11.3994,
"num_tokens": 5995555.0,
"step": 839
},
{
"epoch": 0.5870020964360587,
"grad_norm": 0.7862902039549512,
"learning_rate": 7.7117905649644e-05,
"loss": 11.3371,
"num_tokens": 6002655.0,
"step": 840
},
{
"epoch": 0.5877009084556254,
"grad_norm": 0.8543777698570504,
"learning_rate": 7.689763004216135e-05,
"loss": 11.5781,
"num_tokens": 6009185.0,
"step": 841
},
{
"epoch": 0.5883997204751922,
"grad_norm": 0.9108297671558842,
"learning_rate": 7.667747278710406e-05,
"loss": 11.2922,
"num_tokens": 6016823.0,
"step": 842
},
{
"epoch": 0.589098532494759,
"grad_norm": 0.7511611223234017,
"learning_rate": 7.6457435012328e-05,
"loss": 11.4936,
"num_tokens": 6024771.0,
"step": 843
},
{
"epoch": 0.5897973445143256,
"grad_norm": 0.8464284988171904,
"learning_rate": 7.623751784507706e-05,
"loss": 11.4268,
"num_tokens": 6031975.0,
"step": 844
},
{
"epoch": 0.5904961565338924,
"grad_norm": 0.8292138849138506,
"learning_rate": 7.601772241197719e-05,
"loss": 11.6141,
"num_tokens": 6039124.0,
"step": 845
},
{
"epoch": 0.5911949685534591,
"grad_norm": 0.8644997646527984,
"learning_rate": 7.579804983903067e-05,
"loss": 11.6254,
"num_tokens": 6045752.0,
"step": 846
},
{
"epoch": 0.5918937805730259,
"grad_norm": 0.9065613000680274,
"learning_rate": 7.557850125161053e-05,
"loss": 11.5652,
"num_tokens": 6052231.0,
"step": 847
},
{
"epoch": 0.5925925925925926,
"grad_norm": 0.8567835900522591,
"learning_rate": 7.535907777445449e-05,
"loss": 11.3823,
"num_tokens": 6059886.0,
"step": 848
},
{
"epoch": 0.5932914046121593,
"grad_norm": 0.8345066461262305,
"learning_rate": 7.513978053165934e-05,
"loss": 11.6205,
"num_tokens": 6066548.0,
"step": 849
},
{
"epoch": 0.593990216631726,
"grad_norm": 0.8056119012251634,
"learning_rate": 7.492061064667526e-05,
"loss": 11.4517,
"num_tokens": 6074097.0,
"step": 850
},
{
"epoch": 0.5946890286512928,
"grad_norm": 0.9106299776518808,
"learning_rate": 7.470156924229988e-05,
"loss": 11.524,
"num_tokens": 6080717.0,
"step": 851
},
{
"epoch": 0.5953878406708596,
"grad_norm": 0.7853436652395662,
"learning_rate": 7.448265744067275e-05,
"loss": 11.3629,
"num_tokens": 6088678.0,
"step": 852
},
{
"epoch": 0.5960866526904263,
"grad_norm": 0.8446726817091805,
"learning_rate": 7.426387636326936e-05,
"loss": 11.5285,
"num_tokens": 6095534.0,
"step": 853
},
{
"epoch": 0.596785464709993,
"grad_norm": 0.8096739560163518,
"learning_rate": 7.404522713089554e-05,
"loss": 11.5661,
"num_tokens": 6102965.0,
"step": 854
},
{
"epoch": 0.5974842767295597,
"grad_norm": 0.8368927033929309,
"learning_rate": 7.382671086368172e-05,
"loss": 11.2703,
"num_tokens": 6109857.0,
"step": 855
},
{
"epoch": 0.5981830887491265,
"grad_norm": 0.9545515909179914,
"learning_rate": 7.360832868107708e-05,
"loss": 11.8182,
"num_tokens": 6116670.0,
"step": 856
},
{
"epoch": 0.5988819007686932,
"grad_norm": 0.8061669184109826,
"learning_rate": 7.3390081701844e-05,
"loss": 11.2728,
"num_tokens": 6123896.0,
"step": 857
},
{
"epoch": 0.59958071278826,
"grad_norm": 0.8533029486109188,
"learning_rate": 7.317197104405213e-05,
"loss": 11.5543,
"num_tokens": 6130750.0,
"step": 858
},
{
"epoch": 0.6002795248078266,
"grad_norm": 0.8969968982568871,
"learning_rate": 7.295399782507275e-05,
"loss": 11.3407,
"num_tokens": 6137850.0,
"step": 859
},
{
"epoch": 0.6009783368273934,
"grad_norm": 0.8544932603225329,
"learning_rate": 7.273616316157312e-05,
"loss": 11.3906,
"num_tokens": 6144967.0,
"step": 860
},
{
"epoch": 0.6016771488469602,
"grad_norm": 0.9575871493500785,
"learning_rate": 7.251846816951063e-05,
"loss": 11.3528,
"num_tokens": 6151858.0,
"step": 861
},
{
"epoch": 0.6023759608665269,
"grad_norm": 0.9034387927043012,
"learning_rate": 7.23009139641271e-05,
"loss": 11.4893,
"num_tokens": 6158407.0,
"step": 862
},
{
"epoch": 0.6030747728860937,
"grad_norm": 0.8300032020729585,
"learning_rate": 7.208350165994325e-05,
"loss": 11.6586,
"num_tokens": 6165454.0,
"step": 863
},
{
"epoch": 0.6037735849056604,
"grad_norm": 0.9613981820130882,
"learning_rate": 7.186623237075265e-05,
"loss": 11.4186,
"num_tokens": 6172278.0,
"step": 864
},
{
"epoch": 0.6044723969252271,
"grad_norm": 0.8216153509086527,
"learning_rate": 7.16491072096164e-05,
"loss": 11.5343,
"num_tokens": 6179334.0,
"step": 865
},
{
"epoch": 0.6051712089447938,
"grad_norm": 0.8120991028042943,
"learning_rate": 7.143212728885714e-05,
"loss": 11.4413,
"num_tokens": 6186949.0,
"step": 866
},
{
"epoch": 0.6058700209643606,
"grad_norm": 0.8725448199985772,
"learning_rate": 7.121529372005335e-05,
"loss": 11.5006,
"num_tokens": 6194084.0,
"step": 867
},
{
"epoch": 0.6065688329839273,
"grad_norm": 0.7857788374163461,
"learning_rate": 7.099860761403403e-05,
"loss": 11.5642,
"num_tokens": 6201312.0,
"step": 868
},
{
"epoch": 0.6072676450034941,
"grad_norm": 0.749546501349823,
"learning_rate": 7.078207008087248e-05,
"loss": 11.5207,
"num_tokens": 6208547.0,
"step": 869
},
{
"epoch": 0.6079664570230608,
"grad_norm": 0.807549031552495,
"learning_rate": 7.056568222988099e-05,
"loss": 11.5615,
"num_tokens": 6215891.0,
"step": 870
},
{
"epoch": 0.6086652690426275,
"grad_norm": 0.7753534051289811,
"learning_rate": 7.034944516960498e-05,
"loss": 11.4658,
"num_tokens": 6223503.0,
"step": 871
},
{
"epoch": 0.6093640810621943,
"grad_norm": 0.7732259838681556,
"learning_rate": 7.013336000781738e-05,
"loss": 11.5801,
"num_tokens": 6230239.0,
"step": 872
},
{
"epoch": 0.610062893081761,
"grad_norm": 0.7574853840801833,
"learning_rate": 6.991742785151305e-05,
"loss": 11.3924,
"num_tokens": 6237658.0,
"step": 873
},
{
"epoch": 0.6107617051013278,
"grad_norm": 0.8360871413852982,
"learning_rate": 6.970164980690285e-05,
"loss": 11.5716,
"num_tokens": 6244558.0,
"step": 874
},
{
"epoch": 0.6114605171208944,
"grad_norm": 0.8199482824692478,
"learning_rate": 6.94860269794083e-05,
"loss": 11.5512,
"num_tokens": 6251785.0,
"step": 875
},
{
"epoch": 0.6121593291404612,
"grad_norm": 0.7837266750011456,
"learning_rate": 6.927056047365557e-05,
"loss": 11.3226,
"num_tokens": 6258748.0,
"step": 876
},
{
"epoch": 0.6128581411600279,
"grad_norm": 0.7577779156383097,
"learning_rate": 6.905525139347011e-05,
"loss": 11.494,
"num_tokens": 6266062.0,
"step": 877
},
{
"epoch": 0.6135569531795947,
"grad_norm": 0.8052344965652813,
"learning_rate": 6.884010084187093e-05,
"loss": 11.427,
"num_tokens": 6272639.0,
"step": 878
},
{
"epoch": 0.6142557651991615,
"grad_norm": 0.8325818177093272,
"learning_rate": 6.86251099210648e-05,
"loss": 11.5747,
"num_tokens": 6279204.0,
"step": 879
},
{
"epoch": 0.6149545772187281,
"grad_norm": 0.7754376173157712,
"learning_rate": 6.841027973244076e-05,
"loss": 11.4026,
"num_tokens": 6286372.0,
"step": 880
},
{
"epoch": 0.6156533892382949,
"grad_norm": 0.798756729619339,
"learning_rate": 6.819561137656443e-05,
"loss": 11.3876,
"num_tokens": 6292809.0,
"step": 881
},
{
"epoch": 0.6163522012578616,
"grad_norm": 0.7719865569386571,
"learning_rate": 6.798110595317229e-05,
"loss": 11.4415,
"num_tokens": 6300059.0,
"step": 882
},
{
"epoch": 0.6170510132774284,
"grad_norm": 0.8195039075085477,
"learning_rate": 6.776676456116629e-05,
"loss": 11.5411,
"num_tokens": 6306992.0,
"step": 883
},
{
"epoch": 0.6177498252969951,
"grad_norm": 0.8027317976901461,
"learning_rate": 6.755258829860791e-05,
"loss": 11.3557,
"num_tokens": 6313854.0,
"step": 884
},
{
"epoch": 0.6184486373165619,
"grad_norm": 0.771851720603674,
"learning_rate": 6.733857826271271e-05,
"loss": 11.369,
"num_tokens": 6320866.0,
"step": 885
},
{
"epoch": 0.6191474493361285,
"grad_norm": 0.7674659928459595,
"learning_rate": 6.712473554984472e-05,
"loss": 11.3876,
"num_tokens": 6328614.0,
"step": 886
},
{
"epoch": 0.6198462613556953,
"grad_norm": 0.7600859890865244,
"learning_rate": 6.69110612555107e-05,
"loss": 11.4883,
"num_tokens": 6336010.0,
"step": 887
},
{
"epoch": 0.6205450733752621,
"grad_norm": 0.8243419809913102,
"learning_rate": 6.669755647435474e-05,
"loss": 11.3781,
"num_tokens": 6342374.0,
"step": 888
},
{
"epoch": 0.6212438853948288,
"grad_norm": 0.7829044387907746,
"learning_rate": 6.648422230015242e-05,
"loss": 11.3094,
"num_tokens": 6349581.0,
"step": 889
},
{
"epoch": 0.6219426974143956,
"grad_norm": 0.8205152326848861,
"learning_rate": 6.627105982580528e-05,
"loss": 11.4789,
"num_tokens": 6356441.0,
"step": 890
},
{
"epoch": 0.6226415094339622,
"grad_norm": 0.8121937689718386,
"learning_rate": 6.605807014333538e-05,
"loss": 11.3885,
"num_tokens": 6363148.0,
"step": 891
},
{
"epoch": 0.623340321453529,
"grad_norm": 0.8187543353581098,
"learning_rate": 6.584525434387944e-05,
"loss": 11.367,
"num_tokens": 6370176.0,
"step": 892
},
{
"epoch": 0.6240391334730957,
"grad_norm": 0.8189881007250471,
"learning_rate": 6.563261351768345e-05,
"loss": 11.6037,
"num_tokens": 6377384.0,
"step": 893
},
{
"epoch": 0.6247379454926625,
"grad_norm": 0.7403030747434303,
"learning_rate": 6.542014875409703e-05,
"loss": 11.3652,
"num_tokens": 6384876.0,
"step": 894
},
{
"epoch": 0.6254367575122292,
"grad_norm": 0.8179439653721886,
"learning_rate": 6.52078611415678e-05,
"loss": 11.3852,
"num_tokens": 6391664.0,
"step": 895
},
{
"epoch": 0.6261355695317959,
"grad_norm": 0.8515340307528421,
"learning_rate": 6.49957517676359e-05,
"loss": 11.3703,
"num_tokens": 6398231.0,
"step": 896
},
{
"epoch": 0.6268343815513627,
"grad_norm": 0.8288077296988552,
"learning_rate": 6.47838217189283e-05,
"loss": 11.353,
"num_tokens": 6405010.0,
"step": 897
},
{
"epoch": 0.6275331935709294,
"grad_norm": 0.730129626547827,
"learning_rate": 6.457207208115328e-05,
"loss": 11.5185,
"num_tokens": 6412293.0,
"step": 898
},
{
"epoch": 0.6282320055904962,
"grad_norm": 0.8202972201931359,
"learning_rate": 6.436050393909499e-05,
"loss": 11.5474,
"num_tokens": 6419190.0,
"step": 899
},
{
"epoch": 0.6289308176100629,
"grad_norm": 0.7830504737205021,
"learning_rate": 6.414911837660768e-05,
"loss": 11.1973,
"num_tokens": 6426719.0,
"step": 900
},
{
"epoch": 0.6296296296296297,
"grad_norm": 0.7530937085031525,
"learning_rate": 6.393791647661032e-05,
"loss": 11.5101,
"num_tokens": 6434060.0,
"step": 901
},
{
"epoch": 0.6303284416491963,
"grad_norm": 0.7906021221246778,
"learning_rate": 6.372689932108091e-05,
"loss": 11.5279,
"num_tokens": 6441144.0,
"step": 902
},
{
"epoch": 0.6310272536687631,
"grad_norm": 0.8223449747267925,
"learning_rate": 6.351606799105107e-05,
"loss": 11.2108,
"num_tokens": 6448000.0,
"step": 903
},
{
"epoch": 0.6317260656883298,
"grad_norm": 0.7078365936333572,
"learning_rate": 6.330542356660046e-05,
"loss": 11.4398,
"num_tokens": 6455854.0,
"step": 904
},
{
"epoch": 0.6324248777078966,
"grad_norm": 0.8055321988221168,
"learning_rate": 6.309496712685122e-05,
"loss": 11.3534,
"num_tokens": 6462687.0,
"step": 905
},
{
"epoch": 0.6331236897274634,
"grad_norm": 0.8080504596422721,
"learning_rate": 6.288469974996234e-05,
"loss": 11.4418,
"num_tokens": 6469302.0,
"step": 906
},
{
"epoch": 0.63382250174703,
"grad_norm": 0.8806667746234638,
"learning_rate": 6.267462251312445e-05,
"loss": 11.4176,
"num_tokens": 6475494.0,
"step": 907
},
{
"epoch": 0.6345213137665968,
"grad_norm": 0.7642307341098185,
"learning_rate": 6.24647364925539e-05,
"loss": 11.399,
"num_tokens": 6483132.0,
"step": 908
},
{
"epoch": 0.6352201257861635,
"grad_norm": 0.7753132086477652,
"learning_rate": 6.225504276348766e-05,
"loss": 11.2906,
"num_tokens": 6490796.0,
"step": 909
},
{
"epoch": 0.6359189378057303,
"grad_norm": 0.7486646132302851,
"learning_rate": 6.204554240017742e-05,
"loss": 11.352,
"num_tokens": 6498706.0,
"step": 910
},
{
"epoch": 0.636617749825297,
"grad_norm": 0.778566234919517,
"learning_rate": 6.183623647588427e-05,
"loss": 11.5678,
"num_tokens": 6505674.0,
"step": 911
},
{
"epoch": 0.6373165618448637,
"grad_norm": 0.9101437765550461,
"learning_rate": 6.162712606287335e-05,
"loss": 11.384,
"num_tokens": 6512122.0,
"step": 912
},
{
"epoch": 0.6380153738644304,
"grad_norm": 0.791821933199024,
"learning_rate": 6.141821223240804e-05,
"loss": 11.3918,
"num_tokens": 6519491.0,
"step": 913
},
{
"epoch": 0.6387141858839972,
"grad_norm": 0.8037666912694112,
"learning_rate": 6.120949605474478e-05,
"loss": 11.5467,
"num_tokens": 6526251.0,
"step": 914
},
{
"epoch": 0.639412997903564,
"grad_norm": 0.8390371657278727,
"learning_rate": 6.100097859912732e-05,
"loss": 11.4776,
"num_tokens": 6532851.0,
"step": 915
},
{
"epoch": 0.6401118099231307,
"grad_norm": 0.8884662450896984,
"learning_rate": 6.0792660933781375e-05,
"loss": 11.3619,
"num_tokens": 6539983.0,
"step": 916
},
{
"epoch": 0.6408106219426974,
"grad_norm": 0.7726014803170672,
"learning_rate": 6.058454412590928e-05,
"loss": 11.3864,
"num_tokens": 6547267.0,
"step": 917
},
{
"epoch": 0.6415094339622641,
"grad_norm": 0.8533702770606022,
"learning_rate": 6.037662924168419e-05,
"loss": 11.2928,
"num_tokens": 6554111.0,
"step": 918
},
{
"epoch": 0.6422082459818309,
"grad_norm": 0.8468889356156204,
"learning_rate": 6.016891734624501e-05,
"loss": 11.4525,
"num_tokens": 6561069.0,
"step": 919
},
{
"epoch": 0.6429070580013976,
"grad_norm": 0.8612184509336287,
"learning_rate": 5.9961409503690605e-05,
"loss": 11.6099,
"num_tokens": 6568238.0,
"step": 920
},
{
"epoch": 0.6436058700209644,
"grad_norm": 0.7434062790975529,
"learning_rate": 5.975410677707447e-05,
"loss": 11.4411,
"num_tokens": 6575880.0,
"step": 921
},
{
"epoch": 0.6443046820405312,
"grad_norm": 0.8174217364264119,
"learning_rate": 5.954701022839944e-05,
"loss": 11.4125,
"num_tokens": 6582676.0,
"step": 922
},
{
"epoch": 0.6450034940600978,
"grad_norm": 0.7558016655031929,
"learning_rate": 5.9340120918611994e-05,
"loss": 11.6664,
"num_tokens": 6590130.0,
"step": 923
},
{
"epoch": 0.6457023060796646,
"grad_norm": 0.7940056450454165,
"learning_rate": 5.913343990759695e-05,
"loss": 11.5493,
"num_tokens": 6596815.0,
"step": 924
},
{
"epoch": 0.6464011180992313,
"grad_norm": 0.7726837389032853,
"learning_rate": 5.8926968254172076e-05,
"loss": 11.1489,
"num_tokens": 6604041.0,
"step": 925
},
{
"epoch": 0.6470999301187981,
"grad_norm": 0.823514945514921,
"learning_rate": 5.872070701608251e-05,
"loss": 11.4563,
"num_tokens": 6611449.0,
"step": 926
},
{
"epoch": 0.6477987421383647,
"grad_norm": 0.7998018031164993,
"learning_rate": 5.851465724999559e-05,
"loss": 11.55,
"num_tokens": 6618417.0,
"step": 927
},
{
"epoch": 0.6484975541579315,
"grad_norm": 0.7462225767654096,
"learning_rate": 5.830882001149517e-05,
"loss": 11.3976,
"num_tokens": 6626409.0,
"step": 928
},
{
"epoch": 0.6491963661774982,
"grad_norm": 0.7942705495379757,
"learning_rate": 5.8103196355076305e-05,
"loss": 11.464,
"num_tokens": 6633421.0,
"step": 929
},
{
"epoch": 0.649895178197065,
"grad_norm": 0.7533581744429543,
"learning_rate": 5.789778733414004e-05,
"loss": 11.4489,
"num_tokens": 6641387.0,
"step": 930
},
{
"epoch": 0.6505939902166318,
"grad_norm": 0.7765742186173611,
"learning_rate": 5.769259400098769e-05,
"loss": 11.2764,
"num_tokens": 6648880.0,
"step": 931
},
{
"epoch": 0.6512928022361985,
"grad_norm": 0.7416483184978369,
"learning_rate": 5.748761740681573e-05,
"loss": 11.4409,
"num_tokens": 6656518.0,
"step": 932
},
{
"epoch": 0.6519916142557652,
"grad_norm": 0.7462623775574179,
"learning_rate": 5.728285860171021e-05,
"loss": 11.315,
"num_tokens": 6663873.0,
"step": 933
},
{
"epoch": 0.6526904262753319,
"grad_norm": 0.7926028186762867,
"learning_rate": 5.7078318634641456e-05,
"loss": 11.4408,
"num_tokens": 6671021.0,
"step": 934
},
{
"epoch": 0.6533892382948987,
"grad_norm": 0.8186251647673696,
"learning_rate": 5.687399855345879e-05,
"loss": 11.4383,
"num_tokens": 6677620.0,
"step": 935
},
{
"epoch": 0.6540880503144654,
"grad_norm": 0.7280289619725115,
"learning_rate": 5.666989940488496e-05,
"loss": 11.417,
"num_tokens": 6685101.0,
"step": 936
},
{
"epoch": 0.6547868623340322,
"grad_norm": 0.7915454831387722,
"learning_rate": 5.646602223451094e-05,
"loss": 11.2694,
"num_tokens": 6692207.0,
"step": 937
},
{
"epoch": 0.6554856743535988,
"grad_norm": 0.7279570365472292,
"learning_rate": 5.6262368086790504e-05,
"loss": 11.313,
"num_tokens": 6699759.0,
"step": 938
},
{
"epoch": 0.6561844863731656,
"grad_norm": 0.7968453443976489,
"learning_rate": 5.605893800503484e-05,
"loss": 11.3699,
"num_tokens": 6706906.0,
"step": 939
},
{
"epoch": 0.6568832983927324,
"grad_norm": 0.8290722608319229,
"learning_rate": 5.585573303140741e-05,
"loss": 11.4912,
"num_tokens": 6713394.0,
"step": 940
},
{
"epoch": 0.6575821104122991,
"grad_norm": 0.7823549978752478,
"learning_rate": 5.565275420691831e-05,
"loss": 11.3901,
"num_tokens": 6720211.0,
"step": 941
},
{
"epoch": 0.6582809224318659,
"grad_norm": 0.7517377792179483,
"learning_rate": 5.5450002571419104e-05,
"loss": 11.5227,
"num_tokens": 6727718.0,
"step": 942
},
{
"epoch": 0.6589797344514325,
"grad_norm": 0.7578102737888668,
"learning_rate": 5.524747916359756e-05,
"loss": 11.3185,
"num_tokens": 6735027.0,
"step": 943
},
{
"epoch": 0.6596785464709993,
"grad_norm": 0.7294348627795622,
"learning_rate": 5.504518502097212e-05,
"loss": 11.4193,
"num_tokens": 6742667.0,
"step": 944
},
{
"epoch": 0.660377358490566,
"grad_norm": 0.7333256228607329,
"learning_rate": 5.484312117988687e-05,
"loss": 11.415,
"num_tokens": 6750129.0,
"step": 945
},
{
"epoch": 0.6610761705101328,
"grad_norm": 0.7629669936154013,
"learning_rate": 5.464128867550593e-05,
"loss": 11.3356,
"num_tokens": 6756898.0,
"step": 946
},
{
"epoch": 0.6617749825296995,
"grad_norm": 0.7752564686857137,
"learning_rate": 5.4439688541808345e-05,
"loss": 11.5905,
"num_tokens": 6763921.0,
"step": 947
},
{
"epoch": 0.6624737945492662,
"grad_norm": 0.755404720417447,
"learning_rate": 5.423832181158274e-05,
"loss": 11.3755,
"num_tokens": 6771091.0,
"step": 948
},
{
"epoch": 0.663172606568833,
"grad_norm": 0.72492465071565,
"learning_rate": 5.4037189516422e-05,
"loss": 11.2847,
"num_tokens": 6778488.0,
"step": 949
},
{
"epoch": 0.6638714185883997,
"grad_norm": 0.7336264900623639,
"learning_rate": 5.383629268671804e-05,
"loss": 11.2539,
"num_tokens": 6785464.0,
"step": 950
},
{
"epoch": 0.6645702306079665,
"grad_norm": 0.8327633064762886,
"learning_rate": 5.3635632351656495e-05,
"loss": 11.5402,
"num_tokens": 6792243.0,
"step": 951
},
{
"epoch": 0.6652690426275332,
"grad_norm": 0.7237982353291018,
"learning_rate": 5.3435209539211394e-05,
"loss": 11.416,
"num_tokens": 6799493.0,
"step": 952
},
{
"epoch": 0.6659678546471,
"grad_norm": 0.7303663246452419,
"learning_rate": 5.323502527614007e-05,
"loss": 11.3044,
"num_tokens": 6806467.0,
"step": 953
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.8181397264078049,
"learning_rate": 5.303508058797766e-05,
"loss": 11.4364,
"num_tokens": 6813306.0,
"step": 954
},
{
"epoch": 0.6673654786862334,
"grad_norm": 0.7355370180850539,
"learning_rate": 5.2835376499031955e-05,
"loss": 11.1692,
"num_tokens": 6820518.0,
"step": 955
},
{
"epoch": 0.6680642907058001,
"grad_norm": 0.7609137303788498,
"learning_rate": 5.263591403237831e-05,
"loss": 11.3905,
"num_tokens": 6827100.0,
"step": 956
},
{
"epoch": 0.6687631027253669,
"grad_norm": 0.8203786627444394,
"learning_rate": 5.243669420985413e-05,
"loss": 11.359,
"num_tokens": 6833940.0,
"step": 957
},
{
"epoch": 0.6694619147449337,
"grad_norm": 0.752993836488806,
"learning_rate": 5.22377180520538e-05,
"loss": 11.5737,
"num_tokens": 6841696.0,
"step": 958
},
{
"epoch": 0.6701607267645003,
"grad_norm": 0.7162845303848641,
"learning_rate": 5.2038986578323437e-05,
"loss": 11.5206,
"num_tokens": 6849308.0,
"step": 959
},
{
"epoch": 0.6708595387840671,
"grad_norm": 0.7601708724578571,
"learning_rate": 5.1840500806755575e-05,
"loss": 11.2212,
"num_tokens": 6856816.0,
"step": 960
},
{
"epoch": 0.6715583508036338,
"grad_norm": 0.8158602127134996,
"learning_rate": 5.164226175418421e-05,
"loss": 11.4374,
"num_tokens": 6863209.0,
"step": 961
},
{
"epoch": 0.6722571628232006,
"grad_norm": 0.7759684172814786,
"learning_rate": 5.1444270436179185e-05,
"loss": 11.1977,
"num_tokens": 6869856.0,
"step": 962
},
{
"epoch": 0.6729559748427673,
"grad_norm": 0.7307099197973029,
"learning_rate": 5.12465278670414e-05,
"loss": 11.3729,
"num_tokens": 6877774.0,
"step": 963
},
{
"epoch": 0.673654786862334,
"grad_norm": 0.7998954262132016,
"learning_rate": 5.10490350597973e-05,
"loss": 11.3021,
"num_tokens": 6884547.0,
"step": 964
},
{
"epoch": 0.6743535988819007,
"grad_norm": 0.829376113016866,
"learning_rate": 5.085179302619383e-05,
"loss": 11.4544,
"num_tokens": 6892114.0,
"step": 965
},
{
"epoch": 0.6750524109014675,
"grad_norm": 0.8124137234626652,
"learning_rate": 5.06548027766933e-05,
"loss": 11.5562,
"num_tokens": 6899318.0,
"step": 966
},
{
"epoch": 0.6757512229210343,
"grad_norm": 0.731877974221252,
"learning_rate": 5.045806532046806e-05,
"loss": 11.3625,
"num_tokens": 6906566.0,
"step": 967
},
{
"epoch": 0.676450034940601,
"grad_norm": 0.7960249114523722,
"learning_rate": 5.0261581665395475e-05,
"loss": 11.394,
"num_tokens": 6913114.0,
"step": 968
},
{
"epoch": 0.6771488469601677,
"grad_norm": 0.7471261094794464,
"learning_rate": 5.006535281805265e-05,
"loss": 11.2724,
"num_tokens": 6920642.0,
"step": 969
},
{
"epoch": 0.6778476589797344,
"grad_norm": 0.7394163224711746,
"learning_rate": 4.9869379783711315e-05,
"loss": 11.2185,
"num_tokens": 6927787.0,
"step": 970
},
{
"epoch": 0.6785464709993012,
"grad_norm": 0.7482808846351563,
"learning_rate": 4.967366356633275e-05,
"loss": 11.2588,
"num_tokens": 6935406.0,
"step": 971
},
{
"epoch": 0.6792452830188679,
"grad_norm": 0.7449895829598722,
"learning_rate": 4.947820516856253e-05,
"loss": 11.4155,
"num_tokens": 6942968.0,
"step": 972
},
{
"epoch": 0.6799440950384347,
"grad_norm": 0.7866416855660652,
"learning_rate": 4.9283005591725375e-05,
"loss": 11.4755,
"num_tokens": 6949948.0,
"step": 973
},
{
"epoch": 0.6806429070580013,
"grad_norm": 0.7398246001909144,
"learning_rate": 4.908806583582021e-05,
"loss": 11.4443,
"num_tokens": 6957454.0,
"step": 974
},
{
"epoch": 0.6813417190775681,
"grad_norm": 0.7809576116461072,
"learning_rate": 4.8893386899514746e-05,
"loss": 11.3265,
"num_tokens": 6964979.0,
"step": 975
},
{
"epoch": 0.6820405310971349,
"grad_norm": 0.7263811286969053,
"learning_rate": 4.869896978014071e-05,
"loss": 11.4017,
"num_tokens": 6972329.0,
"step": 976
},
{
"epoch": 0.6827393431167016,
"grad_norm": 0.7922288974239042,
"learning_rate": 4.85048154736884e-05,
"loss": 11.4244,
"num_tokens": 6978773.0,
"step": 977
},
{
"epoch": 0.6834381551362684,
"grad_norm": 0.7347442860022921,
"learning_rate": 4.831092497480179e-05,
"loss": 11.3205,
"num_tokens": 6986336.0,
"step": 978
},
{
"epoch": 0.684136967155835,
"grad_norm": 0.8820151378232439,
"learning_rate": 4.81172992767734e-05,
"loss": 11.3669,
"num_tokens": 6992944.0,
"step": 979
},
{
"epoch": 0.6848357791754018,
"grad_norm": 0.8114183972043456,
"learning_rate": 4.792393937153914e-05,
"loss": 11.4598,
"num_tokens": 6999683.0,
"step": 980
},
{
"epoch": 0.6855345911949685,
"grad_norm": 0.7694550494205694,
"learning_rate": 4.773084624967327e-05,
"loss": 11.2862,
"num_tokens": 7006810.0,
"step": 981
},
{
"epoch": 0.6862334032145353,
"grad_norm": 0.8035810017403291,
"learning_rate": 4.753802090038344e-05,
"loss": 11.6007,
"num_tokens": 7013925.0,
"step": 982
},
{
"epoch": 0.686932215234102,
"grad_norm": 0.8779508449006704,
"learning_rate": 4.734546431150536e-05,
"loss": 11.5836,
"num_tokens": 7020571.0,
"step": 983
},
{
"epoch": 0.6876310272536688,
"grad_norm": 0.7782965367791496,
"learning_rate": 4.715317746949804e-05,
"loss": 11.5124,
"num_tokens": 7027401.0,
"step": 984
},
{
"epoch": 0.6883298392732355,
"grad_norm": 0.7429245384849146,
"learning_rate": 4.6961161359438486e-05,
"loss": 11.2208,
"num_tokens": 7034682.0,
"step": 985
},
{
"epoch": 0.6890286512928022,
"grad_norm": 0.768180423011357,
"learning_rate": 4.676941696501673e-05,
"loss": 11.3495,
"num_tokens": 7042175.0,
"step": 986
},
{
"epoch": 0.689727463312369,
"grad_norm": 0.8158507454164047,
"learning_rate": 4.657794526853096e-05,
"loss": 11.4224,
"num_tokens": 7048976.0,
"step": 987
},
{
"epoch": 0.6904262753319357,
"grad_norm": 0.7416723415602191,
"learning_rate": 4.6386747250882224e-05,
"loss": 11.3724,
"num_tokens": 7056210.0,
"step": 988
},
{
"epoch": 0.6911250873515025,
"grad_norm": 0.7862281657651232,
"learning_rate": 4.6195823891569545e-05,
"loss": 11.5203,
"num_tokens": 7063291.0,
"step": 989
},
{
"epoch": 0.6918238993710691,
"grad_norm": 0.7947232569258293,
"learning_rate": 4.60051761686849e-05,
"loss": 11.3728,
"num_tokens": 7070074.0,
"step": 990
},
{
"epoch": 0.6925227113906359,
"grad_norm": 0.7564910310163546,
"learning_rate": 4.581480505890816e-05,
"loss": 11.2835,
"num_tokens": 7077328.0,
"step": 991
},
{
"epoch": 0.6932215234102026,
"grad_norm": 0.7559569974710254,
"learning_rate": 4.5624711537502206e-05,
"loss": 11.4323,
"num_tokens": 7084251.0,
"step": 992
},
{
"epoch": 0.6939203354297694,
"grad_norm": 0.7701109539670761,
"learning_rate": 4.543489657830777e-05,
"loss": 11.3439,
"num_tokens": 7091370.0,
"step": 993
},
{
"epoch": 0.6946191474493362,
"grad_norm": 0.8687082504202258,
"learning_rate": 4.52453611537385e-05,
"loss": 11.418,
"num_tokens": 7097992.0,
"step": 994
},
{
"epoch": 0.6953179594689028,
"grad_norm": 0.8075508267512269,
"learning_rate": 4.505610623477611e-05,
"loss": 11.3894,
"num_tokens": 7104219.0,
"step": 995
},
{
"epoch": 0.6960167714884696,
"grad_norm": 0.7584503225018251,
"learning_rate": 4.486713279096515e-05,
"loss": 11.2692,
"num_tokens": 7110986.0,
"step": 996
},
{
"epoch": 0.6967155835080363,
"grad_norm": 0.7316503352080553,
"learning_rate": 4.4678441790408335e-05,
"loss": 11.431,
"num_tokens": 7118553.0,
"step": 997
},
{
"epoch": 0.6974143955276031,
"grad_norm": 0.7804898512960603,
"learning_rate": 4.449003419976133e-05,
"loss": 11.2494,
"num_tokens": 7125671.0,
"step": 998
},
{
"epoch": 0.6981132075471698,
"grad_norm": 0.7665956804053953,
"learning_rate": 4.430191098422795e-05,
"loss": 11.172,
"num_tokens": 7132247.0,
"step": 999
},
{
"epoch": 0.6988120195667366,
"grad_norm": 0.8340556176005602,
"learning_rate": 4.411407310755513e-05,
"loss": 11.4609,
"num_tokens": 7138298.0,
"step": 1000
},
{
"epoch": 0.6995108315863033,
"grad_norm": 0.7808514429539747,
"learning_rate": 4.392652153202802e-05,
"loss": 11.485,
"num_tokens": 7144756.0,
"step": 1001
},
{
"epoch": 0.70020964360587,
"grad_norm": 0.7536778557900022,
"learning_rate": 4.373925721846519e-05,
"loss": 11.3155,
"num_tokens": 7152146.0,
"step": 1002
},
{
"epoch": 0.7009084556254368,
"grad_norm": 0.7235512526836008,
"learning_rate": 4.355228112621341e-05,
"loss": 11.3711,
"num_tokens": 7159343.0,
"step": 1003
},
{
"epoch": 0.7016072676450035,
"grad_norm": 0.7842398648638949,
"learning_rate": 4.336559421314298e-05,
"loss": 11.3397,
"num_tokens": 7165846.0,
"step": 1004
},
{
"epoch": 0.7023060796645703,
"grad_norm": 0.7675919005184618,
"learning_rate": 4.317919743564278e-05,
"loss": 11.4522,
"num_tokens": 7173032.0,
"step": 1005
},
{
"epoch": 0.7030048916841369,
"grad_norm": 0.7870096182872951,
"learning_rate": 4.29930917486153e-05,
"loss": 11.4789,
"num_tokens": 7179676.0,
"step": 1006
},
{
"epoch": 0.7037037037037037,
"grad_norm": 0.7135708920198698,
"learning_rate": 4.2807278105471735e-05,
"loss": 11.2755,
"num_tokens": 7187102.0,
"step": 1007
},
{
"epoch": 0.7044025157232704,
"grad_norm": 0.8003412427834519,
"learning_rate": 4.2621757458127285e-05,
"loss": 11.3892,
"num_tokens": 7194233.0,
"step": 1008
},
{
"epoch": 0.7051013277428372,
"grad_norm": 0.7585197255574855,
"learning_rate": 4.243653075699604e-05,
"loss": 11.3803,
"num_tokens": 7200796.0,
"step": 1009
},
{
"epoch": 0.705800139762404,
"grad_norm": 0.8452156813049073,
"learning_rate": 4.2251598950986226e-05,
"loss": 11.4087,
"num_tokens": 7207142.0,
"step": 1010
},
{
"epoch": 0.7064989517819706,
"grad_norm": 0.7835457389050836,
"learning_rate": 4.2066962987495376e-05,
"loss": 11.4664,
"num_tokens": 7213920.0,
"step": 1011
},
{
"epoch": 0.7071977638015374,
"grad_norm": 0.7451177469095189,
"learning_rate": 4.188262381240534e-05,
"loss": 11.5746,
"num_tokens": 7221271.0,
"step": 1012
},
{
"epoch": 0.7078965758211041,
"grad_norm": 0.6745508455944015,
"learning_rate": 4.169858237007772e-05,
"loss": 11.3237,
"num_tokens": 7229149.0,
"step": 1013
},
{
"epoch": 0.7085953878406709,
"grad_norm": 0.7325516422452724,
"learning_rate": 4.151483960334862e-05,
"loss": 11.2666,
"num_tokens": 7236889.0,
"step": 1014
},
{
"epoch": 0.7092941998602376,
"grad_norm": 0.7597756270035179,
"learning_rate": 4.133139645352425e-05,
"loss": 11.4504,
"num_tokens": 7243535.0,
"step": 1015
},
{
"epoch": 0.7099930118798043,
"grad_norm": 0.802411771746198,
"learning_rate": 4.114825386037576e-05,
"loss": 11.5466,
"num_tokens": 7249939.0,
"step": 1016
},
{
"epoch": 0.710691823899371,
"grad_norm": 0.6745688733699289,
"learning_rate": 4.0965412762134556e-05,
"loss": 11.3435,
"num_tokens": 7257993.0,
"step": 1017
},
{
"epoch": 0.7113906359189378,
"grad_norm": 0.8172015168604911,
"learning_rate": 4.078287409548763e-05,
"loss": 11.489,
"num_tokens": 7264252.0,
"step": 1018
},
{
"epoch": 0.7120894479385046,
"grad_norm": 0.7313370944739739,
"learning_rate": 4.060063879557249e-05,
"loss": 11.1882,
"num_tokens": 7271420.0,
"step": 1019
},
{
"epoch": 0.7127882599580713,
"grad_norm": 0.7413343223804779,
"learning_rate": 4.0418707795972574e-05,
"loss": 11.5518,
"num_tokens": 7278894.0,
"step": 1020
},
{
"epoch": 0.713487071977638,
"grad_norm": 0.70036680794593,
"learning_rate": 4.023708202871239e-05,
"loss": 11.2365,
"num_tokens": 7286940.0,
"step": 1021
},
{
"epoch": 0.7141858839972047,
"grad_norm": 0.6917658234021815,
"learning_rate": 4.005576242425272e-05,
"loss": 11.0293,
"num_tokens": 7294745.0,
"step": 1022
},
{
"epoch": 0.7148846960167715,
"grad_norm": 0.7723810701467926,
"learning_rate": 3.9874749911485995e-05,
"loss": 11.3387,
"num_tokens": 7301959.0,
"step": 1023
},
{
"epoch": 0.7155835080363382,
"grad_norm": 0.7972689072311142,
"learning_rate": 3.969404541773132e-05,
"loss": 11.3007,
"num_tokens": 7308592.0,
"step": 1024
},
{
"epoch": 0.716282320055905,
"grad_norm": 0.7665594285270318,
"learning_rate": 3.951364986872984e-05,
"loss": 11.2227,
"num_tokens": 7315715.0,
"step": 1025
},
{
"epoch": 0.7169811320754716,
"grad_norm": 0.8147865188294503,
"learning_rate": 3.933356418864008e-05,
"loss": 11.3663,
"num_tokens": 7322229.0,
"step": 1026
},
{
"epoch": 0.7176799440950384,
"grad_norm": 0.7563572784068692,
"learning_rate": 3.9153789300033e-05,
"loss": 11.4636,
"num_tokens": 7329104.0,
"step": 1027
},
{
"epoch": 0.7183787561146052,
"grad_norm": 0.6925283206897185,
"learning_rate": 3.8974326123887515e-05,
"loss": 11.1833,
"num_tokens": 7336700.0,
"step": 1028
},
{
"epoch": 0.7190775681341719,
"grad_norm": 0.7362192730624995,
"learning_rate": 3.879517557958554e-05,
"loss": 11.5596,
"num_tokens": 7343801.0,
"step": 1029
},
{
"epoch": 0.7197763801537387,
"grad_norm": 0.8067914900843238,
"learning_rate": 3.861633858490745e-05,
"loss": 11.4494,
"num_tokens": 7350746.0,
"step": 1030
},
{
"epoch": 0.7204751921733054,
"grad_norm": 0.7039344440821224,
"learning_rate": 3.8437816056027296e-05,
"loss": 11.2428,
"num_tokens": 7358560.0,
"step": 1031
},
{
"epoch": 0.7211740041928721,
"grad_norm": 0.7856827734869525,
"learning_rate": 3.82596089075081e-05,
"loss": 11.6543,
"num_tokens": 7365191.0,
"step": 1032
},
{
"epoch": 0.7218728162124388,
"grad_norm": 0.7235482096088951,
"learning_rate": 3.808171805229733e-05,
"loss": 11.3282,
"num_tokens": 7372368.0,
"step": 1033
},
{
"epoch": 0.7225716282320056,
"grad_norm": 0.7387334991329841,
"learning_rate": 3.790414440172197e-05,
"loss": 11.3636,
"num_tokens": 7379881.0,
"step": 1034
},
{
"epoch": 0.7232704402515723,
"grad_norm": 0.7126819433630927,
"learning_rate": 3.7726888865484e-05,
"loss": 11.2277,
"num_tokens": 7386744.0,
"step": 1035
},
{
"epoch": 0.7239692522711391,
"grad_norm": 0.662827736472648,
"learning_rate": 3.754995235165579e-05,
"loss": 11.1434,
"num_tokens": 7394571.0,
"step": 1036
},
{
"epoch": 0.7246680642907058,
"grad_norm": 0.7051999387799784,
"learning_rate": 3.73733357666753e-05,
"loss": 11.2589,
"num_tokens": 7402101.0,
"step": 1037
},
{
"epoch": 0.7253668763102725,
"grad_norm": 0.7341999680956166,
"learning_rate": 3.719704001534149e-05,
"loss": 11.3894,
"num_tokens": 7409120.0,
"step": 1038
},
{
"epoch": 0.7260656883298393,
"grad_norm": 0.7281969314553532,
"learning_rate": 3.702106600080979e-05,
"loss": 11.4819,
"num_tokens": 7416318.0,
"step": 1039
},
{
"epoch": 0.726764500349406,
"grad_norm": 0.7315483076557149,
"learning_rate": 3.6845414624587326e-05,
"loss": 11.2987,
"num_tokens": 7423580.0,
"step": 1040
},
{
"epoch": 0.7274633123689728,
"grad_norm": 0.6970166205503779,
"learning_rate": 3.667008678652837e-05,
"loss": 11.3018,
"num_tokens": 7431059.0,
"step": 1041
},
{
"epoch": 0.7281621243885394,
"grad_norm": 0.6885063172154732,
"learning_rate": 3.6495083384829723e-05,
"loss": 11.2188,
"num_tokens": 7438722.0,
"step": 1042
},
{
"epoch": 0.7288609364081062,
"grad_norm": 0.7822956671181562,
"learning_rate": 3.6320405316026074e-05,
"loss": 11.6013,
"num_tokens": 7445317.0,
"step": 1043
},
{
"epoch": 0.7295597484276729,
"grad_norm": 0.7646739355090845,
"learning_rate": 3.6146053474985564e-05,
"loss": 11.5241,
"num_tokens": 7452194.0,
"step": 1044
},
{
"epoch": 0.7302585604472397,
"grad_norm": 0.6713250992120214,
"learning_rate": 3.597202875490494e-05,
"loss": 11.2791,
"num_tokens": 7460052.0,
"step": 1045
},
{
"epoch": 0.7309573724668065,
"grad_norm": 0.745173558135304,
"learning_rate": 3.579833204730525e-05,
"loss": 11.3639,
"num_tokens": 7467357.0,
"step": 1046
},
{
"epoch": 0.7316561844863732,
"grad_norm": 0.8031651896658117,
"learning_rate": 3.562496424202707e-05,
"loss": 11.3606,
"num_tokens": 7474027.0,
"step": 1047
},
{
"epoch": 0.7323549965059399,
"grad_norm": 0.7166095564634701,
"learning_rate": 3.5451926227225997e-05,
"loss": 11.2742,
"num_tokens": 7481232.0,
"step": 1048
},
{
"epoch": 0.7330538085255066,
"grad_norm": 0.7326672859023466,
"learning_rate": 3.5279218889368225e-05,
"loss": 11.3805,
"num_tokens": 7487946.0,
"step": 1049
},
{
"epoch": 0.7337526205450734,
"grad_norm": 0.7416236009384238,
"learning_rate": 3.5106843113225854e-05,
"loss": 11.3072,
"num_tokens": 7494713.0,
"step": 1050
},
{
"epoch": 0.7344514325646401,
"grad_norm": 0.7012124715014632,
"learning_rate": 3.493479978187236e-05,
"loss": 11.3789,
"num_tokens": 7502002.0,
"step": 1051
},
{
"epoch": 0.7351502445842069,
"grad_norm": 0.7221924272062566,
"learning_rate": 3.4763089776678203e-05,
"loss": 11.3354,
"num_tokens": 7509025.0,
"step": 1052
},
{
"epoch": 0.7358490566037735,
"grad_norm": 0.7724816602138137,
"learning_rate": 3.459171397730614e-05,
"loss": 11.3475,
"num_tokens": 7515456.0,
"step": 1053
},
{
"epoch": 0.7365478686233403,
"grad_norm": 0.7475946730617843,
"learning_rate": 3.44206732617069e-05,
"loss": 11.3963,
"num_tokens": 7522493.0,
"step": 1054
},
{
"epoch": 0.7372466806429071,
"grad_norm": 0.7683500970071147,
"learning_rate": 3.424996850611455e-05,
"loss": 11.2392,
"num_tokens": 7529387.0,
"step": 1055
},
{
"epoch": 0.7379454926624738,
"grad_norm": 0.6733763063940768,
"learning_rate": 3.4079600585041996e-05,
"loss": 10.9176,
"num_tokens": 7537726.0,
"step": 1056
},
{
"epoch": 0.7386443046820406,
"grad_norm": 0.697410181297856,
"learning_rate": 3.3909570371276654e-05,
"loss": 11.4406,
"num_tokens": 7545113.0,
"step": 1057
},
{
"epoch": 0.7393431167016072,
"grad_norm": 0.8385269941717998,
"learning_rate": 3.3739878735875796e-05,
"loss": 11.4465,
"num_tokens": 7552056.0,
"step": 1058
},
{
"epoch": 0.740041928721174,
"grad_norm": 0.6766701266960187,
"learning_rate": 3.357052654816225e-05,
"loss": 11.2494,
"num_tokens": 7559863.0,
"step": 1059
},
{
"epoch": 0.7407407407407407,
"grad_norm": 0.7308228283594846,
"learning_rate": 3.3401514675719816e-05,
"loss": 11.4521,
"num_tokens": 7567323.0,
"step": 1060
},
{
"epoch": 0.7414395527603075,
"grad_norm": 0.7646099741455271,
"learning_rate": 3.323284398438886e-05,
"loss": 11.242,
"num_tokens": 7573959.0,
"step": 1061
},
{
"epoch": 0.7421383647798742,
"grad_norm": 0.7427632832724356,
"learning_rate": 3.306451533826194e-05,
"loss": 11.4877,
"num_tokens": 7580996.0,
"step": 1062
},
{
"epoch": 0.742837176799441,
"grad_norm": 0.7581322530475002,
"learning_rate": 3.289652959967925e-05,
"loss": 11.5307,
"num_tokens": 7587525.0,
"step": 1063
},
{
"epoch": 0.7435359888190077,
"grad_norm": 0.7028610260607215,
"learning_rate": 3.272888762922442e-05,
"loss": 11.4367,
"num_tokens": 7595094.0,
"step": 1064
},
{
"epoch": 0.7442348008385744,
"grad_norm": 0.7017221063986278,
"learning_rate": 3.2561590285719856e-05,
"loss": 11.276,
"num_tokens": 7602618.0,
"step": 1065
},
{
"epoch": 0.7449336128581412,
"grad_norm": 0.6865354079002892,
"learning_rate": 3.2394638426222467e-05,
"loss": 11.1785,
"num_tokens": 7610143.0,
"step": 1066
},
{
"epoch": 0.7456324248777079,
"grad_norm": 0.6740360060533553,
"learning_rate": 3.222803290601934e-05,
"loss": 11.3203,
"num_tokens": 7617786.0,
"step": 1067
},
{
"epoch": 0.7463312368972747,
"grad_norm": 0.7110904217960554,
"learning_rate": 3.20617745786232e-05,
"loss": 11.1755,
"num_tokens": 7624874.0,
"step": 1068
},
{
"epoch": 0.7470300489168413,
"grad_norm": 0.7377273985684303,
"learning_rate": 3.189586429576812e-05,
"loss": 11.3264,
"num_tokens": 7631644.0,
"step": 1069
},
{
"epoch": 0.7477288609364081,
"grad_norm": 0.8518215847668774,
"learning_rate": 3.173030290740524e-05,
"loss": 11.4216,
"num_tokens": 7637610.0,
"step": 1070
},
{
"epoch": 0.7484276729559748,
"grad_norm": 0.731189174579239,
"learning_rate": 3.1565091261698245e-05,
"loss": 11.2452,
"num_tokens": 7644746.0,
"step": 1071
},
{
"epoch": 0.7491264849755416,
"grad_norm": 0.7307695312815669,
"learning_rate": 3.140023020501912e-05,
"loss": 11.1864,
"num_tokens": 7651980.0,
"step": 1072
},
{
"epoch": 0.7498252969951084,
"grad_norm": 0.7470129712613078,
"learning_rate": 3.1235720581943827e-05,
"loss": 11.341,
"num_tokens": 7658633.0,
"step": 1073
},
{
"epoch": 0.750524109014675,
"grad_norm": 0.6974467676001982,
"learning_rate": 3.107156323524788e-05,
"loss": 11.131,
"num_tokens": 7666388.0,
"step": 1074
},
{
"epoch": 0.7512229210342418,
"grad_norm": 0.7710427604926284,
"learning_rate": 3.0907759005902224e-05,
"loss": 11.3886,
"num_tokens": 7673401.0,
"step": 1075
},
{
"epoch": 0.7519217330538085,
"grad_norm": 0.7722971345364247,
"learning_rate": 3.074430873306865e-05,
"loss": 11.4449,
"num_tokens": 7680465.0,
"step": 1076
},
{
"epoch": 0.7526205450733753,
"grad_norm": 0.7226343668653663,
"learning_rate": 3.058121325409579e-05,
"loss": 11.3678,
"num_tokens": 7687681.0,
"step": 1077
},
{
"epoch": 0.753319357092942,
"grad_norm": 0.7770625505796288,
"learning_rate": 3.041847340451456e-05,
"loss": 11.3737,
"num_tokens": 7694753.0,
"step": 1078
},
{
"epoch": 0.7540181691125087,
"grad_norm": 0.7999608445071387,
"learning_rate": 3.0256090018034046e-05,
"loss": 11.3074,
"num_tokens": 7700964.0,
"step": 1079
},
{
"epoch": 0.7547169811320755,
"grad_norm": 0.7507601730126232,
"learning_rate": 3.009406392653723e-05,
"loss": 11.2428,
"num_tokens": 7707592.0,
"step": 1080
},
{
"epoch": 0.7554157931516422,
"grad_norm": 0.6810868569742109,
"learning_rate": 2.993239596007669e-05,
"loss": 11.2451,
"num_tokens": 7715404.0,
"step": 1081
},
{
"epoch": 0.756114605171209,
"grad_norm": 0.6849850433555468,
"learning_rate": 2.9771086946870175e-05,
"loss": 11.172,
"num_tokens": 7722495.0,
"step": 1082
},
{
"epoch": 0.7568134171907757,
"grad_norm": 0.8728657634726412,
"learning_rate": 2.9610137713296783e-05,
"loss": 11.5636,
"num_tokens": 7728379.0,
"step": 1083
},
{
"epoch": 0.7575122292103424,
"grad_norm": 0.739038187201558,
"learning_rate": 2.9449549083892292e-05,
"loss": 11.4312,
"num_tokens": 7735201.0,
"step": 1084
},
{
"epoch": 0.7582110412299091,
"grad_norm": 0.8444559834520016,
"learning_rate": 2.9289321881345254e-05,
"loss": 11.6626,
"num_tokens": 7741618.0,
"step": 1085
},
{
"epoch": 0.7589098532494759,
"grad_norm": 0.7334827192137122,
"learning_rate": 2.9129456926492548e-05,
"loss": 11.2497,
"num_tokens": 7748854.0,
"step": 1086
},
{
"epoch": 0.7596086652690426,
"grad_norm": 0.7214074829950128,
"learning_rate": 2.8969955038315277e-05,
"loss": 11.153,
"num_tokens": 7756291.0,
"step": 1087
},
{
"epoch": 0.7603074772886094,
"grad_norm": 0.7558791668864552,
"learning_rate": 2.8810817033934656e-05,
"loss": 11.1789,
"num_tokens": 7763508.0,
"step": 1088
},
{
"epoch": 0.7610062893081762,
"grad_norm": 0.7137740371199445,
"learning_rate": 2.8652043728607625e-05,
"loss": 11.3707,
"num_tokens": 7771093.0,
"step": 1089
},
{
"epoch": 0.7617051013277428,
"grad_norm": 0.7894834299698231,
"learning_rate": 2.8493635935722928e-05,
"loss": 11.2646,
"num_tokens": 7777723.0,
"step": 1090
},
{
"epoch": 0.7624039133473096,
"grad_norm": 0.7321387669937152,
"learning_rate": 2.8335594466796656e-05,
"loss": 11.3176,
"num_tokens": 7784753.0,
"step": 1091
},
{
"epoch": 0.7631027253668763,
"grad_norm": 0.783647730922316,
"learning_rate": 2.8177920131468273e-05,
"loss": 11.5208,
"num_tokens": 7791905.0,
"step": 1092
},
{
"epoch": 0.7638015373864431,
"grad_norm": 0.7429571159412155,
"learning_rate": 2.8020613737496547e-05,
"loss": 11.3798,
"num_tokens": 7799579.0,
"step": 1093
},
{
"epoch": 0.7645003494060097,
"grad_norm": 0.7472473035800477,
"learning_rate": 2.7863676090755176e-05,
"loss": 11.4021,
"num_tokens": 7806386.0,
"step": 1094
},
{
"epoch": 0.7651991614255765,
"grad_norm": 0.6754659129161072,
"learning_rate": 2.770710799522879e-05,
"loss": 11.2082,
"num_tokens": 7814116.0,
"step": 1095
},
{
"epoch": 0.7658979734451432,
"grad_norm": 0.7693144724539596,
"learning_rate": 2.7550910253008933e-05,
"loss": 11.3185,
"num_tokens": 7820763.0,
"step": 1096
},
{
"epoch": 0.76659678546471,
"grad_norm": 0.7091084838894971,
"learning_rate": 2.739508366428969e-05,
"loss": 11.2438,
"num_tokens": 7828321.0,
"step": 1097
},
{
"epoch": 0.7672955974842768,
"grad_norm": 0.705869316522346,
"learning_rate": 2.723962902736389e-05,
"loss": 11.2762,
"num_tokens": 7835798.0,
"step": 1098
},
{
"epoch": 0.7679944095038435,
"grad_norm": 0.7138100544611918,
"learning_rate": 2.7084547138618778e-05,
"loss": 11.3148,
"num_tokens": 7842754.0,
"step": 1099
},
{
"epoch": 0.7686932215234102,
"grad_norm": 0.7151217684539595,
"learning_rate": 2.6929838792532037e-05,
"loss": 11.217,
"num_tokens": 7849613.0,
"step": 1100
},
{
"epoch": 0.7693920335429769,
"grad_norm": 0.7188728094758737,
"learning_rate": 2.6775504781667725e-05,
"loss": 11.3475,
"num_tokens": 7856848.0,
"step": 1101
},
{
"epoch": 0.7700908455625437,
"grad_norm": 0.7052028694843995,
"learning_rate": 2.6621545896672174e-05,
"loss": 11.3796,
"num_tokens": 7863992.0,
"step": 1102
},
{
"epoch": 0.7707896575821104,
"grad_norm": 0.6914477298742575,
"learning_rate": 2.6467962926270017e-05,
"loss": 11.3365,
"num_tokens": 7872390.0,
"step": 1103
},
{
"epoch": 0.7714884696016772,
"grad_norm": 0.7137530093694683,
"learning_rate": 2.6314756657260054e-05,
"loss": 11.2455,
"num_tokens": 7879165.0,
"step": 1104
},
{
"epoch": 0.7721872816212438,
"grad_norm": 0.7247684771863809,
"learning_rate": 2.6161927874511216e-05,
"loss": 11.2368,
"num_tokens": 7886459.0,
"step": 1105
},
{
"epoch": 0.7728860936408106,
"grad_norm": 0.7194841517185353,
"learning_rate": 2.6009477360958712e-05,
"loss": 11.3713,
"num_tokens": 7893611.0,
"step": 1106
},
{
"epoch": 0.7735849056603774,
"grad_norm": 0.7026960816920488,
"learning_rate": 2.585740589759976e-05,
"loss": 11.2143,
"num_tokens": 7900850.0,
"step": 1107
},
{
"epoch": 0.7742837176799441,
"grad_norm": 0.7434038696739714,
"learning_rate": 2.5705714263489776e-05,
"loss": 11.2671,
"num_tokens": 7907796.0,
"step": 1108
},
{
"epoch": 0.7749825296995109,
"grad_norm": 0.7487747923849204,
"learning_rate": 2.555440323573839e-05,
"loss": 11.4015,
"num_tokens": 7915024.0,
"step": 1109
},
{
"epoch": 0.7756813417190775,
"grad_norm": 0.79745216396254,
"learning_rate": 2.540347358950529e-05,
"loss": 11.1625,
"num_tokens": 7921430.0,
"step": 1110
},
{
"epoch": 0.7763801537386443,
"grad_norm": 0.7596177805629818,
"learning_rate": 2.5252926097996445e-05,
"loss": 11.4533,
"num_tokens": 7928272.0,
"step": 1111
},
{
"epoch": 0.777078965758211,
"grad_norm": 0.7350154935054022,
"learning_rate": 2.5102761532460008e-05,
"loss": 11.4029,
"num_tokens": 7934944.0,
"step": 1112
},
{
"epoch": 0.7777777777777778,
"grad_norm": 0.7499238754252547,
"learning_rate": 2.4952980662182425e-05,
"loss": 11.1187,
"num_tokens": 7941339.0,
"step": 1113
},
{
"epoch": 0.7784765897973445,
"grad_norm": 0.7453807382953898,
"learning_rate": 2.4803584254484568e-05,
"loss": 11.3362,
"num_tokens": 7948332.0,
"step": 1114
},
{
"epoch": 0.7791754018169113,
"grad_norm": 0.746935094009976,
"learning_rate": 2.4654573074717602e-05,
"loss": 11.1033,
"num_tokens": 7955689.0,
"step": 1115
},
{
"epoch": 0.779874213836478,
"grad_norm": 0.7132201817546339,
"learning_rate": 2.4505947886259318e-05,
"loss": 11.4412,
"num_tokens": 7962958.0,
"step": 1116
},
{
"epoch": 0.7805730258560447,
"grad_norm": 0.6636018994598554,
"learning_rate": 2.435770945050997e-05,
"loss": 11.1383,
"num_tokens": 7970832.0,
"step": 1117
},
{
"epoch": 0.7812718378756115,
"grad_norm": 0.7607115224553903,
"learning_rate": 2.420985852688854e-05,
"loss": 11.4823,
"num_tokens": 7977881.0,
"step": 1118
},
{
"epoch": 0.7819706498951782,
"grad_norm": 0.6604593093823747,
"learning_rate": 2.4062395872828846e-05,
"loss": 11.2511,
"num_tokens": 7985659.0,
"step": 1119
},
{
"epoch": 0.782669461914745,
"grad_norm": 0.7004472719825999,
"learning_rate": 2.3915322243775562e-05,
"loss": 11.1951,
"num_tokens": 7993219.0,
"step": 1120
},
{
"epoch": 0.7833682739343116,
"grad_norm": 0.7154398128015275,
"learning_rate": 2.3768638393180407e-05,
"loss": 11.2984,
"num_tokens": 8000242.0,
"step": 1121
},
{
"epoch": 0.7840670859538784,
"grad_norm": 0.7194741079202231,
"learning_rate": 2.362234507249832e-05,
"loss": 11.3538,
"num_tokens": 8007589.0,
"step": 1122
},
{
"epoch": 0.7847658979734451,
"grad_norm": 0.665405485222391,
"learning_rate": 2.3476443031183503e-05,
"loss": 11.3998,
"num_tokens": 8015113.0,
"step": 1123
},
{
"epoch": 0.7854647099930119,
"grad_norm": 0.7497006916629403,
"learning_rate": 2.3330933016685754e-05,
"loss": 11.3447,
"num_tokens": 8021703.0,
"step": 1124
},
{
"epoch": 0.7861635220125787,
"grad_norm": 0.724069609414905,
"learning_rate": 2.318581577444646e-05,
"loss": 11.2892,
"num_tokens": 8028563.0,
"step": 1125
},
{
"epoch": 0.7868623340321453,
"grad_norm": 0.7369772430147077,
"learning_rate": 2.304109204789484e-05,
"loss": 11.3706,
"num_tokens": 8035481.0,
"step": 1126
},
{
"epoch": 0.7875611460517121,
"grad_norm": 0.7210670035313614,
"learning_rate": 2.289676257844423e-05,
"loss": 11.4219,
"num_tokens": 8042374.0,
"step": 1127
},
{
"epoch": 0.7882599580712788,
"grad_norm": 0.6806882221443891,
"learning_rate": 2.275282810548811e-05,
"loss": 11.2311,
"num_tokens": 8050149.0,
"step": 1128
},
{
"epoch": 0.7889587700908456,
"grad_norm": 0.7336078475191532,
"learning_rate": 2.2609289366396502e-05,
"loss": 11.3744,
"num_tokens": 8056810.0,
"step": 1129
},
{
"epoch": 0.7896575821104123,
"grad_norm": 0.7276364501652224,
"learning_rate": 2.2466147096512035e-05,
"loss": 11.5052,
"num_tokens": 8063712.0,
"step": 1130
},
{
"epoch": 0.790356394129979,
"grad_norm": 0.707614090504406,
"learning_rate": 2.2323402029146244e-05,
"loss": 11.2308,
"num_tokens": 8070844.0,
"step": 1131
},
{
"epoch": 0.7910552061495457,
"grad_norm": 0.7077004079623015,
"learning_rate": 2.2181054895575847e-05,
"loss": 11.1824,
"num_tokens": 8078320.0,
"step": 1132
},
{
"epoch": 0.7917540181691125,
"grad_norm": 0.6953790426100739,
"learning_rate": 2.2039106425038924e-05,
"loss": 11.4368,
"num_tokens": 8085826.0,
"step": 1133
},
{
"epoch": 0.7924528301886793,
"grad_norm": 0.7209711307337222,
"learning_rate": 2.189755734473129e-05,
"loss": 11.3935,
"num_tokens": 8092777.0,
"step": 1134
},
{
"epoch": 0.793151642208246,
"grad_norm": 0.808854769776892,
"learning_rate": 2.175640837980265e-05,
"loss": 11.5428,
"num_tokens": 8099130.0,
"step": 1135
},
{
"epoch": 0.7938504542278128,
"grad_norm": 0.801908412177368,
"learning_rate": 2.161566025335289e-05,
"loss": 11.3935,
"num_tokens": 8105524.0,
"step": 1136
},
{
"epoch": 0.7945492662473794,
"grad_norm": 0.7150555268911781,
"learning_rate": 2.1475313686428544e-05,
"loss": 11.2527,
"num_tokens": 8112500.0,
"step": 1137
},
{
"epoch": 0.7952480782669462,
"grad_norm": 0.7357614818712042,
"learning_rate": 2.133536939801888e-05,
"loss": 11.3546,
"num_tokens": 8119456.0,
"step": 1138
},
{
"epoch": 0.7959468902865129,
"grad_norm": 0.7370468321948003,
"learning_rate": 2.1195828105052283e-05,
"loss": 11.2675,
"num_tokens": 8126211.0,
"step": 1139
},
{
"epoch": 0.7966457023060797,
"grad_norm": 0.7794774618969516,
"learning_rate": 2.105669052239274e-05,
"loss": 11.2933,
"num_tokens": 8132659.0,
"step": 1140
},
{
"epoch": 0.7973445143256463,
"grad_norm": 0.7733675605498702,
"learning_rate": 2.091795736283593e-05,
"loss": 11.4373,
"num_tokens": 8139414.0,
"step": 1141
},
{
"epoch": 0.7980433263452131,
"grad_norm": 0.711124356307087,
"learning_rate": 2.0779629337105722e-05,
"loss": 11.1578,
"num_tokens": 8146787.0,
"step": 1142
},
{
"epoch": 0.7987421383647799,
"grad_norm": 0.7872909388951395,
"learning_rate": 2.064170715385052e-05,
"loss": 11.2581,
"num_tokens": 8153506.0,
"step": 1143
},
{
"epoch": 0.7994409503843466,
"grad_norm": 0.6555684032466877,
"learning_rate": 2.050419151963957e-05,
"loss": 11.4168,
"num_tokens": 8161093.0,
"step": 1144
},
{
"epoch": 0.8001397624039134,
"grad_norm": 0.7378265255419366,
"learning_rate": 2.0367083138959476e-05,
"loss": 11.3239,
"num_tokens": 8167975.0,
"step": 1145
},
{
"epoch": 0.80083857442348,
"grad_norm": 0.7314500876097451,
"learning_rate": 2.0230382714210384e-05,
"loss": 11.3205,
"num_tokens": 8174799.0,
"step": 1146
},
{
"epoch": 0.8015373864430468,
"grad_norm": 0.7415518531134959,
"learning_rate": 2.0094090945702616e-05,
"loss": 11.4,
"num_tokens": 8181864.0,
"step": 1147
},
{
"epoch": 0.8022361984626135,
"grad_norm": 0.7101043302442525,
"learning_rate": 1.9958208531652877e-05,
"loss": 11.3687,
"num_tokens": 8188903.0,
"step": 1148
},
{
"epoch": 0.8029350104821803,
"grad_norm": 0.6457420702696471,
"learning_rate": 1.9822736168180778e-05,
"loss": 11.3409,
"num_tokens": 8196709.0,
"step": 1149
},
{
"epoch": 0.803633822501747,
"grad_norm": 0.7225646893500235,
"learning_rate": 1.9687674549305335e-05,
"loss": 11.431,
"num_tokens": 8204133.0,
"step": 1150
},
{
"epoch": 0.8043326345213138,
"grad_norm": 0.6803983158734834,
"learning_rate": 1.9553024366941242e-05,
"loss": 11.2953,
"num_tokens": 8211596.0,
"step": 1151
},
{
"epoch": 0.8050314465408805,
"grad_norm": 0.7218921095299768,
"learning_rate": 1.9418786310895464e-05,
"loss": 11.432,
"num_tokens": 8218648.0,
"step": 1152
},
{
"epoch": 0.8057302585604472,
"grad_norm": 0.64913746065341,
"learning_rate": 1.9284961068863673e-05,
"loss": 11.3047,
"num_tokens": 8226878.0,
"step": 1153
},
{
"epoch": 0.806429070580014,
"grad_norm": 0.7074942179242839,
"learning_rate": 1.9151549326426656e-05,
"loss": 11.3206,
"num_tokens": 8234332.0,
"step": 1154
},
{
"epoch": 0.8071278825995807,
"grad_norm": 0.7527787088886351,
"learning_rate": 1.9018551767046966e-05,
"loss": 11.2193,
"num_tokens": 8241077.0,
"step": 1155
},
{
"epoch": 0.8078266946191475,
"grad_norm": 0.7062668084460995,
"learning_rate": 1.8885969072065225e-05,
"loss": 11.1638,
"num_tokens": 8248516.0,
"step": 1156
},
{
"epoch": 0.8085255066387141,
"grad_norm": 0.7424957637297211,
"learning_rate": 1.8753801920696712e-05,
"loss": 11.316,
"num_tokens": 8255076.0,
"step": 1157
},
{
"epoch": 0.8092243186582809,
"grad_norm": 0.6823678136180803,
"learning_rate": 1.8622050990027995e-05,
"loss": 11.2536,
"num_tokens": 8262629.0,
"step": 1158
},
{
"epoch": 0.8099231306778477,
"grad_norm": 0.750023065492982,
"learning_rate": 1.8490716955013232e-05,
"loss": 11.2706,
"num_tokens": 8269455.0,
"step": 1159
},
{
"epoch": 0.8106219426974144,
"grad_norm": 0.7704625910645484,
"learning_rate": 1.8359800488470978e-05,
"loss": 11.3387,
"num_tokens": 8276214.0,
"step": 1160
},
{
"epoch": 0.8113207547169812,
"grad_norm": 0.6826518153130348,
"learning_rate": 1.8229302261080495e-05,
"loss": 11.3443,
"num_tokens": 8283578.0,
"step": 1161
},
{
"epoch": 0.8120195667365478,
"grad_norm": 0.7423036450637122,
"learning_rate": 1.809922294137847e-05,
"loss": 11.4065,
"num_tokens": 8290424.0,
"step": 1162
},
{
"epoch": 0.8127183787561146,
"grad_norm": 0.6793892788712328,
"learning_rate": 1.7969563195755535e-05,
"loss": 11.1861,
"num_tokens": 8298504.0,
"step": 1163
},
{
"epoch": 0.8134171907756813,
"grad_norm": 0.7214794343659785,
"learning_rate": 1.784032368845283e-05,
"loss": 11.4229,
"num_tokens": 8305815.0,
"step": 1164
},
{
"epoch": 0.8141160027952481,
"grad_norm": 0.7485617746043014,
"learning_rate": 1.7711505081558734e-05,
"loss": 11.2375,
"num_tokens": 8312752.0,
"step": 1165
},
{
"epoch": 0.8148148148148148,
"grad_norm": 0.6748388760967671,
"learning_rate": 1.758310803500527e-05,
"loss": 11.3151,
"num_tokens": 8320180.0,
"step": 1166
},
{
"epoch": 0.8155136268343816,
"grad_norm": 0.7313883011335084,
"learning_rate": 1.7455133206564832e-05,
"loss": 11.1296,
"num_tokens": 8327115.0,
"step": 1167
},
{
"epoch": 0.8162124388539483,
"grad_norm": 0.6691961538450998,
"learning_rate": 1.73275812518469e-05,
"loss": 11.2173,
"num_tokens": 8334667.0,
"step": 1168
},
{
"epoch": 0.816911250873515,
"grad_norm": 0.7672032209833499,
"learning_rate": 1.7200452824294498e-05,
"loss": 11.2806,
"num_tokens": 8340918.0,
"step": 1169
},
{
"epoch": 0.8176100628930818,
"grad_norm": 0.7333853304823384,
"learning_rate": 1.707374857518094e-05,
"loss": 11.3178,
"num_tokens": 8347718.0,
"step": 1170
},
{
"epoch": 0.8183088749126485,
"grad_norm": 0.7083154662968923,
"learning_rate": 1.6947469153606577e-05,
"loss": 11.4407,
"num_tokens": 8355388.0,
"step": 1171
},
{
"epoch": 0.8190076869322153,
"grad_norm": 0.7605932556421211,
"learning_rate": 1.6821615206495312e-05,
"loss": 11.4539,
"num_tokens": 8362353.0,
"step": 1172
},
{
"epoch": 0.8197064989517819,
"grad_norm": 0.7196641926932061,
"learning_rate": 1.6696187378591376e-05,
"loss": 11.2854,
"num_tokens": 8369668.0,
"step": 1173
},
{
"epoch": 0.8204053109713487,
"grad_norm": 0.7607679332819676,
"learning_rate": 1.657118631245601e-05,
"loss": 11.5675,
"num_tokens": 8376453.0,
"step": 1174
},
{
"epoch": 0.8211041229909154,
"grad_norm": 0.6781286235128161,
"learning_rate": 1.6446612648464164e-05,
"loss": 11.2414,
"num_tokens": 8384073.0,
"step": 1175
},
{
"epoch": 0.8218029350104822,
"grad_norm": 0.6575078980008394,
"learning_rate": 1.632246702480128e-05,
"loss": 11.0235,
"num_tokens": 8391615.0,
"step": 1176
},
{
"epoch": 0.822501747030049,
"grad_norm": 0.6936210648421214,
"learning_rate": 1.619875007745989e-05,
"loss": 11.2754,
"num_tokens": 8399117.0,
"step": 1177
},
{
"epoch": 0.8232005590496156,
"grad_norm": 0.7861112908302533,
"learning_rate": 1.607546244023651e-05,
"loss": 11.3219,
"num_tokens": 8405508.0,
"step": 1178
},
{
"epoch": 0.8238993710691824,
"grad_norm": 0.7415539090414199,
"learning_rate": 1.5952604744728272e-05,
"loss": 11.4339,
"num_tokens": 8412474.0,
"step": 1179
},
{
"epoch": 0.8245981830887491,
"grad_norm": 0.7613238748546601,
"learning_rate": 1.5830177620329712e-05,
"loss": 11.4025,
"num_tokens": 8419536.0,
"step": 1180
},
{
"epoch": 0.8252969951083159,
"grad_norm": 0.7285477512094749,
"learning_rate": 1.570818169422966e-05,
"loss": 11.3531,
"num_tokens": 8426032.0,
"step": 1181
},
{
"epoch": 0.8259958071278826,
"grad_norm": 0.7939048597348616,
"learning_rate": 1.558661759140786e-05,
"loss": 11.3648,
"num_tokens": 8432351.0,
"step": 1182
},
{
"epoch": 0.8266946191474493,
"grad_norm": 0.7015799677247678,
"learning_rate": 1.5465485934631853e-05,
"loss": 11.237,
"num_tokens": 8439781.0,
"step": 1183
},
{
"epoch": 0.827393431167016,
"grad_norm": 0.6963852994738172,
"learning_rate": 1.5344787344453805e-05,
"loss": 11.3215,
"num_tokens": 8446877.0,
"step": 1184
},
{
"epoch": 0.8280922431865828,
"grad_norm": 0.7447257257726353,
"learning_rate": 1.5224522439207246e-05,
"loss": 11.1872,
"num_tokens": 8453354.0,
"step": 1185
},
{
"epoch": 0.8287910552061496,
"grad_norm": 0.745356152433122,
"learning_rate": 1.5104691835004048e-05,
"loss": 11.3101,
"num_tokens": 8460350.0,
"step": 1186
},
{
"epoch": 0.8294898672257163,
"grad_norm": 0.6723282265681545,
"learning_rate": 1.498529614573111e-05,
"loss": 11.231,
"num_tokens": 8467882.0,
"step": 1187
},
{
"epoch": 0.8301886792452831,
"grad_norm": 0.7694903453602582,
"learning_rate": 1.4866335983047264e-05,
"loss": 11.2441,
"num_tokens": 8474603.0,
"step": 1188
},
{
"epoch": 0.8308874912648497,
"grad_norm": 0.727022997755655,
"learning_rate": 1.4747811956380242e-05,
"loss": 11.363,
"num_tokens": 8481543.0,
"step": 1189
},
{
"epoch": 0.8315863032844165,
"grad_norm": 0.7022103489694916,
"learning_rate": 1.4629724672923384e-05,
"loss": 11.2517,
"num_tokens": 8488957.0,
"step": 1190
},
{
"epoch": 0.8322851153039832,
"grad_norm": 0.6861179535933343,
"learning_rate": 1.4512074737632686e-05,
"loss": 11.122,
"num_tokens": 8496305.0,
"step": 1191
},
{
"epoch": 0.83298392732355,
"grad_norm": 0.91578231716021,
"learning_rate": 1.439486275322357e-05,
"loss": 11.2723,
"num_tokens": 8503384.0,
"step": 1192
},
{
"epoch": 0.8336827393431167,
"grad_norm": 0.696492319983534,
"learning_rate": 1.4278089320167876e-05,
"loss": 11.182,
"num_tokens": 8510477.0,
"step": 1193
},
{
"epoch": 0.8343815513626834,
"grad_norm": 0.7326081564411686,
"learning_rate": 1.4161755036690771e-05,
"loss": 11.2417,
"num_tokens": 8517882.0,
"step": 1194
},
{
"epoch": 0.8350803633822502,
"grad_norm": 0.7429335339057346,
"learning_rate": 1.4045860498767671e-05,
"loss": 11.4592,
"num_tokens": 8524376.0,
"step": 1195
},
{
"epoch": 0.8357791754018169,
"grad_norm": 0.7041470336747101,
"learning_rate": 1.3930406300121179e-05,
"loss": 11.3957,
"num_tokens": 8531630.0,
"step": 1196
},
{
"epoch": 0.8364779874213837,
"grad_norm": 0.7287088760207717,
"learning_rate": 1.3815393032218115e-05,
"loss": 11.179,
"num_tokens": 8538262.0,
"step": 1197
},
{
"epoch": 0.8371767994409504,
"grad_norm": 0.7001293925089493,
"learning_rate": 1.3700821284266351e-05,
"loss": 11.165,
"num_tokens": 8545378.0,
"step": 1198
},
{
"epoch": 0.8378756114605171,
"grad_norm": 0.6577452042600329,
"learning_rate": 1.3586691643211957e-05,
"loss": 11.3174,
"num_tokens": 8553221.0,
"step": 1199
},
{
"epoch": 0.8385744234800838,
"grad_norm": 0.8325890793471512,
"learning_rate": 1.3473004693736036e-05,
"loss": 11.3585,
"num_tokens": 8559107.0,
"step": 1200
},
{
"epoch": 0.8392732354996506,
"grad_norm": 0.747352363910281,
"learning_rate": 1.3359761018251826e-05,
"loss": 11.1498,
"num_tokens": 8565926.0,
"step": 1201
},
{
"epoch": 0.8399720475192173,
"grad_norm": 0.7032494382852825,
"learning_rate": 1.324696119690173e-05,
"loss": 11.4202,
"num_tokens": 8573174.0,
"step": 1202
},
{
"epoch": 0.8406708595387841,
"grad_norm": 0.7081165719952596,
"learning_rate": 1.3134605807554246e-05,
"loss": 10.8787,
"num_tokens": 8579902.0,
"step": 1203
},
{
"epoch": 0.8413696715583509,
"grad_norm": 0.7334322609733372,
"learning_rate": 1.302269542580109e-05,
"loss": 11.5137,
"num_tokens": 8586798.0,
"step": 1204
},
{
"epoch": 0.8420684835779175,
"grad_norm": 0.6820085936877066,
"learning_rate": 1.291123062495424e-05,
"loss": 11.0216,
"num_tokens": 8593890.0,
"step": 1205
},
{
"epoch": 0.8427672955974843,
"grad_norm": 0.790988424419638,
"learning_rate": 1.2800211976042941e-05,
"loss": 11.4354,
"num_tokens": 8600460.0,
"step": 1206
},
{
"epoch": 0.843466107617051,
"grad_norm": 0.7027098919987455,
"learning_rate": 1.268964004781089e-05,
"loss": 11.2323,
"num_tokens": 8607572.0,
"step": 1207
},
{
"epoch": 0.8441649196366178,
"grad_norm": 0.7192825937138662,
"learning_rate": 1.2579515406713193e-05,
"loss": 11.3833,
"num_tokens": 8614609.0,
"step": 1208
},
{
"epoch": 0.8448637316561844,
"grad_norm": 0.7134129075930509,
"learning_rate": 1.246983861691352e-05,
"loss": 11.2464,
"num_tokens": 8621756.0,
"step": 1209
},
{
"epoch": 0.8455625436757512,
"grad_norm": 0.7343490934175165,
"learning_rate": 1.236061024028129e-05,
"loss": 11.1625,
"num_tokens": 8628716.0,
"step": 1210
},
{
"epoch": 0.8462613556953179,
"grad_norm": 0.6619816183303324,
"learning_rate": 1.2251830836388622e-05,
"loss": 11.2272,
"num_tokens": 8636472.0,
"step": 1211
},
{
"epoch": 0.8469601677148847,
"grad_norm": 0.6990600383266643,
"learning_rate": 1.214350096250767e-05,
"loss": 11.3979,
"num_tokens": 8643444.0,
"step": 1212
},
{
"epoch": 0.8476589797344515,
"grad_norm": 0.6847030430264577,
"learning_rate": 1.2035621173607581e-05,
"loss": 11.3022,
"num_tokens": 8650747.0,
"step": 1213
},
{
"epoch": 0.8483577917540182,
"grad_norm": 0.71252099329853,
"learning_rate": 1.192819202235178e-05,
"loss": 11.1707,
"num_tokens": 8657676.0,
"step": 1214
},
{
"epoch": 0.8490566037735849,
"grad_norm": 0.6677411887169162,
"learning_rate": 1.1821214059095088e-05,
"loss": 11.1983,
"num_tokens": 8665448.0,
"step": 1215
},
{
"epoch": 0.8497554157931516,
"grad_norm": 0.7048673577714665,
"learning_rate": 1.1714687831880865e-05,
"loss": 11.3894,
"num_tokens": 8672624.0,
"step": 1216
},
{
"epoch": 0.8504542278127184,
"grad_norm": 0.7184006740909511,
"learning_rate": 1.1608613886438346e-05,
"loss": 11.1164,
"num_tokens": 8679416.0,
"step": 1217
},
{
"epoch": 0.8511530398322851,
"grad_norm": 0.701659143428556,
"learning_rate": 1.1502992766179666e-05,
"loss": 11.3558,
"num_tokens": 8686327.0,
"step": 1218
},
{
"epoch": 0.8518518518518519,
"grad_norm": 0.7509863977103985,
"learning_rate": 1.139782501219715e-05,
"loss": 11.2614,
"num_tokens": 8693164.0,
"step": 1219
},
{
"epoch": 0.8525506638714185,
"grad_norm": 0.7465295676120681,
"learning_rate": 1.1293111163260639e-05,
"loss": 11.1938,
"num_tokens": 8700212.0,
"step": 1220
},
{
"epoch": 0.8532494758909853,
"grad_norm": 0.670212726845832,
"learning_rate": 1.118885175581451e-05,
"loss": 11.3063,
"num_tokens": 8707503.0,
"step": 1221
},
{
"epoch": 0.8539482879105521,
"grad_norm": 0.682152300598347,
"learning_rate": 1.1085047323975173e-05,
"loss": 11.2127,
"num_tokens": 8715062.0,
"step": 1222
},
{
"epoch": 0.8546470999301188,
"grad_norm": 0.695908771897953,
"learning_rate": 1.0981698399528151e-05,
"loss": 11.3079,
"num_tokens": 8722494.0,
"step": 1223
},
{
"epoch": 0.8553459119496856,
"grad_norm": 0.6457103617835629,
"learning_rate": 1.0878805511925438e-05,
"loss": 11.2127,
"num_tokens": 8730292.0,
"step": 1224
},
{
"epoch": 0.8560447239692522,
"grad_norm": 0.6716738001376975,
"learning_rate": 1.0776369188282775e-05,
"loss": 11.4094,
"num_tokens": 8737792.0,
"step": 1225
},
{
"epoch": 0.856743535988819,
"grad_norm": 0.651764381193494,
"learning_rate": 1.0674389953376928e-05,
"loss": 11.2051,
"num_tokens": 8745484.0,
"step": 1226
},
{
"epoch": 0.8574423480083857,
"grad_norm": 0.6740530141860219,
"learning_rate": 1.0572868329643027e-05,
"loss": 11.1321,
"num_tokens": 8752746.0,
"step": 1227
},
{
"epoch": 0.8581411600279525,
"grad_norm": 0.7681446924106794,
"learning_rate": 1.0471804837171916e-05,
"loss": 11.4142,
"num_tokens": 8759307.0,
"step": 1228
},
{
"epoch": 0.8588399720475192,
"grad_norm": 0.6639268473439438,
"learning_rate": 1.0371199993707392e-05,
"loss": 11.1017,
"num_tokens": 8766894.0,
"step": 1229
},
{
"epoch": 0.859538784067086,
"grad_norm": 0.7151411365264585,
"learning_rate": 1.027105431464368e-05,
"loss": 11.2614,
"num_tokens": 8773818.0,
"step": 1230
},
{
"epoch": 0.8602375960866527,
"grad_norm": 0.680537812110416,
"learning_rate": 1.0171368313022677e-05,
"loss": 11.1653,
"num_tokens": 8781325.0,
"step": 1231
},
{
"epoch": 0.8609364081062194,
"grad_norm": 0.7086824988570711,
"learning_rate": 1.0072142499531344e-05,
"loss": 11.396,
"num_tokens": 8788576.0,
"step": 1232
},
{
"epoch": 0.8616352201257862,
"grad_norm": 0.7276519344708898,
"learning_rate": 9.973377382499227e-06,
"loss": 11.3805,
"num_tokens": 8794997.0,
"step": 1233
},
{
"epoch": 0.8623340321453529,
"grad_norm": 0.6616315480835158,
"learning_rate": 9.875073467895634e-06,
"loss": 11.2051,
"num_tokens": 8802443.0,
"step": 1234
},
{
"epoch": 0.8630328441649197,
"grad_norm": 0.6304509355340976,
"learning_rate": 9.777231259327212e-06,
"loss": 11.1513,
"num_tokens": 8810645.0,
"step": 1235
},
{
"epoch": 0.8637316561844863,
"grad_norm": 0.6426270503509127,
"learning_rate": 9.679851258035277e-06,
"loss": 11.1122,
"num_tokens": 8818550.0,
"step": 1236
},
{
"epoch": 0.8644304682040531,
"grad_norm": 0.6854294622483896,
"learning_rate": 9.582933962893293e-06,
"loss": 11.1728,
"num_tokens": 8825989.0,
"step": 1237
},
{
"epoch": 0.8651292802236199,
"grad_norm": 0.6466231934487748,
"learning_rate": 9.48647987040433e-06,
"loss": 11.2237,
"num_tokens": 8833884.0,
"step": 1238
},
{
"epoch": 0.8658280922431866,
"grad_norm": 0.7866666947672692,
"learning_rate": 9.390489474698439e-06,
"loss": 11.3251,
"num_tokens": 8840327.0,
"step": 1239
},
{
"epoch": 0.8665269042627534,
"grad_norm": 0.6904961943465432,
"learning_rate": 9.294963267530176e-06,
"loss": 11.3618,
"num_tokens": 8847713.0,
"step": 1240
},
{
"epoch": 0.86722571628232,
"grad_norm": 0.658618408429087,
"learning_rate": 9.19990173827615e-06,
"loss": 11.1388,
"num_tokens": 8855278.0,
"step": 1241
},
{
"epoch": 0.8679245283018868,
"grad_norm": 0.7534943781371122,
"learning_rate": 9.105305373932338e-06,
"loss": 11.1924,
"num_tokens": 8862156.0,
"step": 1242
},
{
"epoch": 0.8686233403214535,
"grad_norm": 0.6787156336949429,
"learning_rate": 9.01117465911181e-06,
"loss": 11.2441,
"num_tokens": 8869458.0,
"step": 1243
},
{
"epoch": 0.8693221523410203,
"grad_norm": 0.7501279068444608,
"learning_rate": 8.917510076042057e-06,
"loss": 11.3719,
"num_tokens": 8875858.0,
"step": 1244
},
{
"epoch": 0.870020964360587,
"grad_norm": 0.7379674425125939,
"learning_rate": 8.824312104562615e-06,
"loss": 11.2014,
"num_tokens": 8882524.0,
"step": 1245
},
{
"epoch": 0.8707197763801537,
"grad_norm": 0.6693741919269287,
"learning_rate": 8.731581222122587e-06,
"loss": 11.3032,
"num_tokens": 8890194.0,
"step": 1246
},
{
"epoch": 0.8714185883997205,
"grad_norm": 0.6794304721197457,
"learning_rate": 8.639317903778189e-06,
"loss": 11.2264,
"num_tokens": 8897718.0,
"step": 1247
},
{
"epoch": 0.8721174004192872,
"grad_norm": 0.6983883664214291,
"learning_rate": 8.547522622190385e-06,
"loss": 11.112,
"num_tokens": 8904932.0,
"step": 1248
},
{
"epoch": 0.872816212438854,
"grad_norm": 0.7184755993502536,
"learning_rate": 8.45619584762235e-06,
"loss": 11.4004,
"num_tokens": 8912001.0,
"step": 1249
},
{
"epoch": 0.8735150244584207,
"grad_norm": 0.685772655591201,
"learning_rate": 8.365338047937121e-06,
"loss": 11.0864,
"num_tokens": 8919388.0,
"step": 1250
},
{
"epoch": 0.8742138364779874,
"grad_norm": 0.713433613357652,
"learning_rate": 8.274949688595224e-06,
"loss": 11.4927,
"num_tokens": 8926115.0,
"step": 1251
},
{
"epoch": 0.8749126484975541,
"grad_norm": 0.796288755617905,
"learning_rate": 8.185031232652251e-06,
"loss": 11.3539,
"num_tokens": 8932498.0,
"step": 1252
},
{
"epoch": 0.8756114605171209,
"grad_norm": 0.7317766316505643,
"learning_rate": 8.095583140756468e-06,
"loss": 11.3538,
"num_tokens": 8939059.0,
"step": 1253
},
{
"epoch": 0.8763102725366876,
"grad_norm": 0.7196340925972718,
"learning_rate": 8.006605871146577e-06,
"loss": 11.2812,
"num_tokens": 8946032.0,
"step": 1254
},
{
"epoch": 0.8770090845562544,
"grad_norm": 0.6933864422218876,
"learning_rate": 7.918099879649144e-06,
"loss": 11.3292,
"num_tokens": 8953182.0,
"step": 1255
},
{
"epoch": 0.8777078965758212,
"grad_norm": 0.7351379035469031,
"learning_rate": 7.830065619676518e-06,
"loss": 11.1246,
"num_tokens": 8959948.0,
"step": 1256
},
{
"epoch": 0.8784067085953878,
"grad_norm": 0.6544570994881753,
"learning_rate": 7.742503542224334e-06,
"loss": 11.2429,
"num_tokens": 8967644.0,
"step": 1257
},
{
"epoch": 0.8791055206149546,
"grad_norm": 0.6641255330980453,
"learning_rate": 7.65541409586924e-06,
"loss": 11.267,
"num_tokens": 8975384.0,
"step": 1258
},
{
"epoch": 0.8798043326345213,
"grad_norm": 0.8045043710675989,
"learning_rate": 7.568797726766686e-06,
"loss": 11.1368,
"num_tokens": 8981552.0,
"step": 1259
},
{
"epoch": 0.8805031446540881,
"grad_norm": 0.6704571245484808,
"learning_rate": 7.482654878648465e-06,
"loss": 11.3728,
"num_tokens": 8989062.0,
"step": 1260
},
{
"epoch": 0.8812019566736548,
"grad_norm": 0.8369669539346279,
"learning_rate": 7.396985992820648e-06,
"loss": 11.2803,
"num_tokens": 8996294.0,
"step": 1261
},
{
"epoch": 0.8819007686932215,
"grad_norm": 0.6847794044512928,
"learning_rate": 7.311791508161159e-06,
"loss": 11.3536,
"num_tokens": 9003435.0,
"step": 1262
},
{
"epoch": 0.8825995807127882,
"grad_norm": 0.6571735057337356,
"learning_rate": 7.227071861117562e-06,
"loss": 11.4174,
"num_tokens": 9011341.0,
"step": 1263
},
{
"epoch": 0.883298392732355,
"grad_norm": 0.7245916491525788,
"learning_rate": 7.14282748570495e-06,
"loss": 11.2201,
"num_tokens": 9017863.0,
"step": 1264
},
{
"epoch": 0.8839972047519218,
"grad_norm": 0.7310401125961942,
"learning_rate": 7.059058813503483e-06,
"loss": 11.4414,
"num_tokens": 9024474.0,
"step": 1265
},
{
"epoch": 0.8846960167714885,
"grad_norm": 0.6450501711377958,
"learning_rate": 6.975766273656425e-06,
"loss": 11.2476,
"num_tokens": 9032345.0,
"step": 1266
},
{
"epoch": 0.8853948287910552,
"grad_norm": 0.7556755113831584,
"learning_rate": 6.892950292867784e-06,
"loss": 11.3764,
"num_tokens": 9038850.0,
"step": 1267
},
{
"epoch": 0.8860936408106219,
"grad_norm": 0.7125713611966051,
"learning_rate": 6.810611295400171e-06,
"loss": 11.3624,
"num_tokens": 9045840.0,
"step": 1268
},
{
"epoch": 0.8867924528301887,
"grad_norm": 0.6642280207143011,
"learning_rate": 6.728749703072679e-06,
"loss": 11.2514,
"num_tokens": 9053258.0,
"step": 1269
},
{
"epoch": 0.8874912648497554,
"grad_norm": 0.6771501155957508,
"learning_rate": 6.647365935258642e-06,
"loss": 11.2015,
"num_tokens": 9060449.0,
"step": 1270
},
{
"epoch": 0.8881900768693222,
"grad_norm": 0.7024388591874622,
"learning_rate": 6.56646040888349e-06,
"loss": 11.3443,
"num_tokens": 9067366.0,
"step": 1271
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.6765742708153049,
"learning_rate": 6.48603353842272e-06,
"loss": 11.2577,
"num_tokens": 9074722.0,
"step": 1272
},
{
"epoch": 0.8895877009084556,
"grad_norm": 0.7261272982152684,
"learning_rate": 6.406085735899625e-06,
"loss": 11.2727,
"num_tokens": 9082068.0,
"step": 1273
},
{
"epoch": 0.8902865129280224,
"grad_norm": 0.6735110971588754,
"learning_rate": 6.326617410883295e-06,
"loss": 11.4339,
"num_tokens": 9089269.0,
"step": 1274
},
{
"epoch": 0.8909853249475891,
"grad_norm": 0.7182901637403895,
"learning_rate": 6.247628970486463e-06,
"loss": 11.085,
"num_tokens": 9096281.0,
"step": 1275
},
{
"epoch": 0.8916841369671559,
"grad_norm": 0.6411503497109853,
"learning_rate": 6.169120819363405e-06,
"loss": 11.3368,
"num_tokens": 9103954.0,
"step": 1276
},
{
"epoch": 0.8923829489867225,
"grad_norm": 0.7214606675707624,
"learning_rate": 6.091093359707977e-06,
"loss": 11.26,
"num_tokens": 9110403.0,
"step": 1277
},
{
"epoch": 0.8930817610062893,
"grad_norm": 0.6884291008923099,
"learning_rate": 6.013546991251373e-06,
"loss": 11.0697,
"num_tokens": 9117361.0,
"step": 1278
},
{
"epoch": 0.893780573025856,
"grad_norm": 0.7410343770266834,
"learning_rate": 5.936482111260278e-06,
"loss": 11.4048,
"num_tokens": 9123889.0,
"step": 1279
},
{
"epoch": 0.8944793850454228,
"grad_norm": 0.6910189149126138,
"learning_rate": 5.859899114534661e-06,
"loss": 11.0736,
"num_tokens": 9130750.0,
"step": 1280
},
{
"epoch": 0.8951781970649895,
"grad_norm": 0.6997111425088132,
"learning_rate": 5.783798393405826e-06,
"loss": 11.4646,
"num_tokens": 9137972.0,
"step": 1281
},
{
"epoch": 0.8958770090845563,
"grad_norm": 0.7381270749567366,
"learning_rate": 5.708180337734448e-06,
"loss": 11.3544,
"num_tokens": 9144975.0,
"step": 1282
},
{
"epoch": 0.896575821104123,
"grad_norm": 0.7016469624483329,
"learning_rate": 5.633045334908493e-06,
"loss": 11.416,
"num_tokens": 9152166.0,
"step": 1283
},
{
"epoch": 0.8972746331236897,
"grad_norm": 0.7555515485608866,
"learning_rate": 5.5583937698412856e-06,
"loss": 11.3033,
"num_tokens": 9158905.0,
"step": 1284
},
{
"epoch": 0.8979734451432565,
"grad_norm": 0.754593708430779,
"learning_rate": 5.4842260249694964e-06,
"loss": 11.5423,
"num_tokens": 9165569.0,
"step": 1285
},
{
"epoch": 0.8986722571628232,
"grad_norm": 0.721083897447478,
"learning_rate": 5.410542480251202e-06,
"loss": 11.3485,
"num_tokens": 9172201.0,
"step": 1286
},
{
"epoch": 0.89937106918239,
"grad_norm": 0.7403874761653653,
"learning_rate": 5.337343513164006e-06,
"loss": 11.1843,
"num_tokens": 9179084.0,
"step": 1287
},
{
"epoch": 0.9000698812019566,
"grad_norm": 0.6781928166122254,
"learning_rate": 5.264629498702967e-06,
"loss": 11.2702,
"num_tokens": 9186520.0,
"step": 1288
},
{
"epoch": 0.9007686932215234,
"grad_norm": 0.64789553108461,
"learning_rate": 5.192400809378783e-06,
"loss": 11.1842,
"num_tokens": 9193927.0,
"step": 1289
},
{
"epoch": 0.9014675052410901,
"grad_norm": 0.638576282772883,
"learning_rate": 5.120657815215879e-06,
"loss": 11.2144,
"num_tokens": 9201756.0,
"step": 1290
},
{
"epoch": 0.9021663172606569,
"grad_norm": 0.6516712972261183,
"learning_rate": 5.0494008837504214e-06,
"loss": 11.1786,
"num_tokens": 9209398.0,
"step": 1291
},
{
"epoch": 0.9028651292802237,
"grad_norm": 0.7567436281029121,
"learning_rate": 4.978630380028582e-06,
"loss": 11.2843,
"num_tokens": 9215890.0,
"step": 1292
},
{
"epoch": 0.9035639412997903,
"grad_norm": 0.7830577964462887,
"learning_rate": 4.908346666604502e-06,
"loss": 11.4464,
"num_tokens": 9222461.0,
"step": 1293
},
{
"epoch": 0.9042627533193571,
"grad_norm": 0.7351201872984475,
"learning_rate": 4.8385501035385746e-06,
"loss": 11.3842,
"num_tokens": 9229085.0,
"step": 1294
},
{
"epoch": 0.9049615653389238,
"grad_norm": 0.6510585165795112,
"learning_rate": 4.769241048395512e-06,
"loss": 11.1701,
"num_tokens": 9236669.0,
"step": 1295
},
{
"epoch": 0.9056603773584906,
"grad_norm": 0.664145384036073,
"learning_rate": 4.700419856242555e-06,
"loss": 11.1899,
"num_tokens": 9243791.0,
"step": 1296
},
{
"epoch": 0.9063591893780573,
"grad_norm": 0.7537789034846496,
"learning_rate": 4.632086879647635e-06,
"loss": 11.4093,
"num_tokens": 9250468.0,
"step": 1297
},
{
"epoch": 0.907058001397624,
"grad_norm": 0.6644394086425081,
"learning_rate": 4.564242468677615e-06,
"loss": 11.1755,
"num_tokens": 9257850.0,
"step": 1298
},
{
"epoch": 0.9077568134171907,
"grad_norm": 0.6585007654131836,
"learning_rate": 4.496886970896396e-06,
"loss": 11.304,
"num_tokens": 9265236.0,
"step": 1299
},
{
"epoch": 0.9084556254367575,
"grad_norm": 0.6909557159895615,
"learning_rate": 4.430020731363271e-06,
"loss": 11.2188,
"num_tokens": 9272210.0,
"step": 1300
},
{
"epoch": 0.9091544374563243,
"grad_norm": 0.7195362441131871,
"learning_rate": 4.3636440926310144e-06,
"loss": 11.5416,
"num_tokens": 9279337.0,
"step": 1301
},
{
"epoch": 0.909853249475891,
"grad_norm": 0.7125110991664391,
"learning_rate": 4.2977573947442175e-06,
"loss": 11.2543,
"num_tokens": 9286092.0,
"step": 1302
},
{
"epoch": 0.9105520614954578,
"grad_norm": 0.6935508566906836,
"learning_rate": 4.232360975237571e-06,
"loss": 11.2606,
"num_tokens": 9292888.0,
"step": 1303
},
{
"epoch": 0.9112508735150244,
"grad_norm": 0.695239157260203,
"learning_rate": 4.167455169134027e-06,
"loss": 11.3306,
"num_tokens": 9299710.0,
"step": 1304
},
{
"epoch": 0.9119496855345912,
"grad_norm": 0.7177219647398039,
"learning_rate": 4.103040308943195e-06,
"loss": 11.3787,
"num_tokens": 9306349.0,
"step": 1305
},
{
"epoch": 0.9126484975541579,
"grad_norm": 0.6631627285340865,
"learning_rate": 4.039116724659564e-06,
"loss": 11.3888,
"num_tokens": 9313892.0,
"step": 1306
},
{
"epoch": 0.9133473095737247,
"grad_norm": 0.7192009063137447,
"learning_rate": 3.975684743760832e-06,
"loss": 11.3041,
"num_tokens": 9320784.0,
"step": 1307
},
{
"epoch": 0.9140461215932913,
"grad_norm": 0.6206047815154435,
"learning_rate": 3.91274469120626e-06,
"loss": 11.075,
"num_tokens": 9328759.0,
"step": 1308
},
{
"epoch": 0.9147449336128581,
"grad_norm": 0.7157002783746432,
"learning_rate": 3.850296889434968e-06,
"loss": 11.6899,
"num_tokens": 9335584.0,
"step": 1309
},
{
"epoch": 0.9154437456324249,
"grad_norm": 0.63380899392185,
"learning_rate": 3.788341658364314e-06,
"loss": 11.2001,
"num_tokens": 9343277.0,
"step": 1310
},
{
"epoch": 0.9161425576519916,
"grad_norm": 0.6594255092883604,
"learning_rate": 3.726879315388199e-06,
"loss": 11.4582,
"num_tokens": 9350951.0,
"step": 1311
},
{
"epoch": 0.9168413696715584,
"grad_norm": 0.7156352255325785,
"learning_rate": 3.665910175375498e-06,
"loss": 11.2162,
"num_tokens": 9357659.0,
"step": 1312
},
{
"epoch": 0.9175401816911251,
"grad_norm": 0.6425872585900699,
"learning_rate": 3.6054345506684627e-06,
"loss": 11.2544,
"num_tokens": 9365388.0,
"step": 1313
},
{
"epoch": 0.9182389937106918,
"grad_norm": 0.6714524114625421,
"learning_rate": 3.5454527510810352e-06,
"loss": 11.1915,
"num_tokens": 9372586.0,
"step": 1314
},
{
"epoch": 0.9189378057302585,
"grad_norm": 0.6582728677450521,
"learning_rate": 3.485965083897347e-06,
"loss": 11.1907,
"num_tokens": 9380241.0,
"step": 1315
},
{
"epoch": 0.9196366177498253,
"grad_norm": 0.7159032392269566,
"learning_rate": 3.426971853870109e-06,
"loss": 11.4437,
"num_tokens": 9387292.0,
"step": 1316
},
{
"epoch": 0.9203354297693921,
"grad_norm": 0.6942298939290196,
"learning_rate": 3.3684733632190157e-06,
"loss": 11.1795,
"num_tokens": 9394171.0,
"step": 1317
},
{
"epoch": 0.9210342417889588,
"grad_norm": 0.6692411875362031,
"learning_rate": 3.310469911629288e-06,
"loss": 11.2257,
"num_tokens": 9401400.0,
"step": 1318
},
{
"epoch": 0.9217330538085255,
"grad_norm": 0.728672155961115,
"learning_rate": 3.252961796250054e-06,
"loss": 11.1689,
"num_tokens": 9408486.0,
"step": 1319
},
{
"epoch": 0.9224318658280922,
"grad_norm": 0.7355419519065545,
"learning_rate": 3.1959493116928476e-06,
"loss": 11.4436,
"num_tokens": 9415285.0,
"step": 1320
},
{
"epoch": 0.923130677847659,
"grad_norm": 0.7030206494953251,
"learning_rate": 3.1394327500301357e-06,
"loss": 11.3679,
"num_tokens": 9422270.0,
"step": 1321
},
{
"epoch": 0.9238294898672257,
"grad_norm": 0.629823205533704,
"learning_rate": 3.0834124007937614e-06,
"loss": 11.3484,
"num_tokens": 9430223.0,
"step": 1322
},
{
"epoch": 0.9245283018867925,
"grad_norm": 0.6789253643905201,
"learning_rate": 3.0278885509735234e-06,
"loss": 11.2971,
"num_tokens": 9437454.0,
"step": 1323
},
{
"epoch": 0.9252271139063591,
"grad_norm": 0.6636854562456178,
"learning_rate": 2.9728614850156653e-06,
"loss": 11.1163,
"num_tokens": 9444734.0,
"step": 1324
},
{
"epoch": 0.9259259259259259,
"grad_norm": 0.7344693956693098,
"learning_rate": 2.9183314848214127e-06,
"loss": 11.3255,
"num_tokens": 9451233.0,
"step": 1325
},
{
"epoch": 0.9266247379454927,
"grad_norm": 0.6770150273794775,
"learning_rate": 2.864298829745571e-06,
"loss": 11.0587,
"num_tokens": 9458292.0,
"step": 1326
},
{
"epoch": 0.9273235499650594,
"grad_norm": 0.6755839488912377,
"learning_rate": 2.8107637965950506e-06,
"loss": 11.1855,
"num_tokens": 9465760.0,
"step": 1327
},
{
"epoch": 0.9280223619846262,
"grad_norm": 0.6907883258293367,
"learning_rate": 2.7577266596274576e-06,
"loss": 11.2504,
"num_tokens": 9473090.0,
"step": 1328
},
{
"epoch": 0.9287211740041929,
"grad_norm": 0.6894014385780622,
"learning_rate": 2.7051876905497375e-06,
"loss": 11.3714,
"num_tokens": 9480091.0,
"step": 1329
},
{
"epoch": 0.9294199860237596,
"grad_norm": 0.6507226401997432,
"learning_rate": 2.6531471585167e-06,
"loss": 11.3168,
"num_tokens": 9487848.0,
"step": 1330
},
{
"epoch": 0.9301187980433263,
"grad_norm": 0.6634040035460618,
"learning_rate": 2.6016053301297196e-06,
"loss": 11.1478,
"num_tokens": 9495032.0,
"step": 1331
},
{
"epoch": 0.9308176100628931,
"grad_norm": 0.6248126259198482,
"learning_rate": 2.5505624694353024e-06,
"loss": 11.1628,
"num_tokens": 9502796.0,
"step": 1332
},
{
"epoch": 0.9315164220824598,
"grad_norm": 0.7223781912313146,
"learning_rate": 2.5000188379237786e-06,
"loss": 11.2473,
"num_tokens": 9509344.0,
"step": 1333
},
{
"epoch": 0.9322152341020266,
"grad_norm": 0.6968748402634183,
"learning_rate": 2.4499746945279566e-06,
"loss": 11.227,
"num_tokens": 9516077.0,
"step": 1334
},
{
"epoch": 0.9329140461215933,
"grad_norm": 0.6518545278329891,
"learning_rate": 2.4004302956217804e-06,
"loss": 11.2451,
"num_tokens": 9523359.0,
"step": 1335
},
{
"epoch": 0.93361285814116,
"grad_norm": 0.7049583629463331,
"learning_rate": 2.3513858950190204e-06,
"loss": 11.2872,
"num_tokens": 9530186.0,
"step": 1336
},
{
"epoch": 0.9343116701607268,
"grad_norm": 0.7118301258107756,
"learning_rate": 2.302841743971995e-06,
"loss": 11.0207,
"num_tokens": 9537136.0,
"step": 1337
},
{
"epoch": 0.9350104821802935,
"grad_norm": 0.6928653342414067,
"learning_rate": 2.2547980911702404e-06,
"loss": 11.3115,
"num_tokens": 9544223.0,
"step": 1338
},
{
"epoch": 0.9357092941998603,
"grad_norm": 0.7236335982996023,
"learning_rate": 2.2072551827392983e-06,
"loss": 11.3392,
"num_tokens": 9550920.0,
"step": 1339
},
{
"epoch": 0.9364081062194269,
"grad_norm": 0.6465425844125828,
"learning_rate": 2.1602132622393746e-06,
"loss": 11.2663,
"num_tokens": 9558586.0,
"step": 1340
},
{
"epoch": 0.9371069182389937,
"grad_norm": 0.658991959593526,
"learning_rate": 2.1136725706641712e-06,
"loss": 11.3773,
"num_tokens": 9566214.0,
"step": 1341
},
{
"epoch": 0.9378057302585604,
"grad_norm": 0.6948550756585189,
"learning_rate": 2.0676333464396126e-06,
"loss": 11.2736,
"num_tokens": 9573631.0,
"step": 1342
},
{
"epoch": 0.9385045422781272,
"grad_norm": 0.7206740338733898,
"learning_rate": 2.0220958254225984e-06,
"loss": 11.3452,
"num_tokens": 9580814.0,
"step": 1343
},
{
"epoch": 0.939203354297694,
"grad_norm": 0.7268069168403607,
"learning_rate": 1.977060240899864e-06,
"loss": 11.2923,
"num_tokens": 9587261.0,
"step": 1344
},
{
"epoch": 0.9399021663172606,
"grad_norm": 0.7172014463716784,
"learning_rate": 1.932526823586722e-06,
"loss": 11.3546,
"num_tokens": 9593973.0,
"step": 1345
},
{
"epoch": 0.9406009783368274,
"grad_norm": 0.623418404084086,
"learning_rate": 1.8884958016259113e-06,
"loss": 11.2887,
"num_tokens": 9601821.0,
"step": 1346
},
{
"epoch": 0.9412997903563941,
"grad_norm": 0.6616585012666008,
"learning_rate": 1.844967400586428e-06,
"loss": 11.2776,
"num_tokens": 9609343.0,
"step": 1347
},
{
"epoch": 0.9419986023759609,
"grad_norm": 0.6433007452971647,
"learning_rate": 1.8019418434623404e-06,
"loss": 11.1876,
"num_tokens": 9616602.0,
"step": 1348
},
{
"epoch": 0.9426974143955276,
"grad_norm": 0.6822594858991043,
"learning_rate": 1.7594193506716983e-06,
"loss": 11.2809,
"num_tokens": 9623622.0,
"step": 1349
},
{
"epoch": 0.9433962264150944,
"grad_norm": 0.6684722637362679,
"learning_rate": 1.7174001400553586e-06,
"loss": 11.3421,
"num_tokens": 9631105.0,
"step": 1350
},
{
"epoch": 0.944095038434661,
"grad_norm": 0.6429394633148133,
"learning_rate": 1.6758844268758843e-06,
"loss": 11.3945,
"num_tokens": 9638663.0,
"step": 1351
},
{
"epoch": 0.9447938504542278,
"grad_norm": 0.5926480868121347,
"learning_rate": 1.634872423816458e-06,
"loss": 11.1168,
"num_tokens": 9646957.0,
"step": 1352
},
{
"epoch": 0.9454926624737946,
"grad_norm": 0.7066366283958508,
"learning_rate": 1.5943643409797594e-06,
"loss": 11.2229,
"num_tokens": 9653670.0,
"step": 1353
},
{
"epoch": 0.9461914744933613,
"grad_norm": 0.6611735635710152,
"learning_rate": 1.5543603858869215e-06,
"loss": 11.2078,
"num_tokens": 9660888.0,
"step": 1354
},
{
"epoch": 0.9468902865129281,
"grad_norm": 0.6742560705489566,
"learning_rate": 1.5148607634764446e-06,
"loss": 11.2851,
"num_tokens": 9668331.0,
"step": 1355
},
{
"epoch": 0.9475890985324947,
"grad_norm": 0.702592812073591,
"learning_rate": 1.475865676103161e-06,
"loss": 11.2943,
"num_tokens": 9675259.0,
"step": 1356
},
{
"epoch": 0.9482879105520615,
"grad_norm": 0.6570155584828766,
"learning_rate": 1.4373753235371823e-06,
"loss": 10.9385,
"num_tokens": 9682674.0,
"step": 1357
},
{
"epoch": 0.9489867225716282,
"grad_norm": 0.7784712864060985,
"learning_rate": 1.3993899029628997e-06,
"loss": 11.3884,
"num_tokens": 9689182.0,
"step": 1358
},
{
"epoch": 0.949685534591195,
"grad_norm": 0.6920868766511811,
"learning_rate": 1.3619096089779293e-06,
"loss": 11.4583,
"num_tokens": 9696437.0,
"step": 1359
},
{
"epoch": 0.9503843466107617,
"grad_norm": 0.7247008722668327,
"learning_rate": 1.3249346335922007e-06,
"loss": 11.4813,
"num_tokens": 9703396.0,
"step": 1360
},
{
"epoch": 0.9510831586303284,
"grad_norm": 0.6632602910524646,
"learning_rate": 1.2884651662268709e-06,
"loss": 11.3547,
"num_tokens": 9711059.0,
"step": 1361
},
{
"epoch": 0.9517819706498952,
"grad_norm": 0.6710435983883922,
"learning_rate": 1.2525013937134122e-06,
"loss": 11.3387,
"num_tokens": 9718400.0,
"step": 1362
},
{
"epoch": 0.9524807826694619,
"grad_norm": 0.664792473697077,
"learning_rate": 1.2170435002926694e-06,
"loss": 11.1944,
"num_tokens": 9725645.0,
"step": 1363
},
{
"epoch": 0.9531795946890287,
"grad_norm": 0.6707698174727578,
"learning_rate": 1.1820916676138382e-06,
"loss": 11.2262,
"num_tokens": 9733005.0,
"step": 1364
},
{
"epoch": 0.9538784067085954,
"grad_norm": 0.6688979482961926,
"learning_rate": 1.147646074733655e-06,
"loss": 11.2677,
"num_tokens": 9740218.0,
"step": 1365
},
{
"epoch": 0.9545772187281621,
"grad_norm": 0.7433455767451992,
"learning_rate": 1.1137068981153632e-06,
"loss": 11.4951,
"num_tokens": 9746821.0,
"step": 1366
},
{
"epoch": 0.9552760307477288,
"grad_norm": 0.7175584827821752,
"learning_rate": 1.0802743116278714e-06,
"loss": 11.2647,
"num_tokens": 9753374.0,
"step": 1367
},
{
"epoch": 0.9559748427672956,
"grad_norm": 0.6933785565534558,
"learning_rate": 1.0473484865448525e-06,
"loss": 11.387,
"num_tokens": 9760505.0,
"step": 1368
},
{
"epoch": 0.9566736547868623,
"grad_norm": 0.6647102154219586,
"learning_rate": 1.014929591543845e-06,
"loss": 11.3239,
"num_tokens": 9767972.0,
"step": 1369
},
{
"epoch": 0.9573724668064291,
"grad_norm": 0.7171114285326375,
"learning_rate": 9.830177927054428e-07,
"loss": 11.2421,
"num_tokens": 9774744.0,
"step": 1370
},
{
"epoch": 0.9580712788259959,
"grad_norm": 0.6753844797667017,
"learning_rate": 9.516132535123846e-07,
"loss": 11.0581,
"num_tokens": 9782050.0,
"step": 1371
},
{
"epoch": 0.9587700908455625,
"grad_norm": 0.6594805435739066,
"learning_rate": 9.207161348487315e-07,
"loss": 11.2986,
"num_tokens": 9789435.0,
"step": 1372
},
{
"epoch": 0.9594689028651293,
"grad_norm": 0.6219305497890216,
"learning_rate": 8.903265949990691e-07,
"loss": 11.2472,
"num_tokens": 9797498.0,
"step": 1373
},
{
"epoch": 0.960167714884696,
"grad_norm": 0.6797824852084828,
"learning_rate": 8.604447896476852e-07,
"loss": 11.2204,
"num_tokens": 9805014.0,
"step": 1374
},
{
"epoch": 0.9608665269042628,
"grad_norm": 0.637072288456489,
"learning_rate": 8.310708718777371e-07,
"loss": 11.0226,
"num_tokens": 9812890.0,
"step": 1375
},
{
"epoch": 0.9615653389238294,
"grad_norm": 0.6883492395222,
"learning_rate": 8.022049921705299e-07,
"loss": 11.1737,
"num_tokens": 9819758.0,
"step": 1376
},
{
"epoch": 0.9622641509433962,
"grad_norm": 0.6679558355010241,
"learning_rate": 7.73847298404684e-07,
"loss": 11.0855,
"num_tokens": 9826786.0,
"step": 1377
},
{
"epoch": 0.9629629629629629,
"grad_norm": 0.6523416128412312,
"learning_rate": 7.459979358554248e-07,
"loss": 11.2854,
"num_tokens": 9834179.0,
"step": 1378
},
{
"epoch": 0.9636617749825297,
"grad_norm": 0.6761703126788413,
"learning_rate": 7.186570471937937e-07,
"loss": 11.4241,
"num_tokens": 9841706.0,
"step": 1379
},
{
"epoch": 0.9643605870020965,
"grad_norm": 0.6539490861725904,
"learning_rate": 6.918247724859939e-07,
"loss": 11.0869,
"num_tokens": 9849180.0,
"step": 1380
},
{
"epoch": 0.9650593990216632,
"grad_norm": 0.6627942977748735,
"learning_rate": 6.655012491925683e-07,
"loss": 11.3332,
"num_tokens": 9856546.0,
"step": 1381
},
{
"epoch": 0.9657582110412299,
"grad_norm": 0.7525079583067344,
"learning_rate": 6.396866121677559e-07,
"loss": 11.485,
"num_tokens": 9863096.0,
"step": 1382
},
{
"epoch": 0.9664570230607966,
"grad_norm": 0.666826182150616,
"learning_rate": 6.143809936588363e-07,
"loss": 11.1335,
"num_tokens": 9870255.0,
"step": 1383
},
{
"epoch": 0.9671558350803634,
"grad_norm": 0.6242640875889792,
"learning_rate": 5.895845233053643e-07,
"loss": 11.2859,
"num_tokens": 9878236.0,
"step": 1384
},
{
"epoch": 0.9678546470999301,
"grad_norm": 0.7272158788230525,
"learning_rate": 5.652973281385588e-07,
"loss": 11.3917,
"num_tokens": 9885146.0,
"step": 1385
},
{
"epoch": 0.9685534591194969,
"grad_norm": 0.6972412481254419,
"learning_rate": 5.415195325806699e-07,
"loss": 11.2604,
"num_tokens": 9892008.0,
"step": 1386
},
{
"epoch": 0.9692522711390635,
"grad_norm": 0.6840355027864292,
"learning_rate": 5.182512584443022e-07,
"loss": 11.3559,
"num_tokens": 9899023.0,
"step": 1387
},
{
"epoch": 0.9699510831586303,
"grad_norm": 0.6604927934689941,
"learning_rate": 4.954926249317815e-07,
"loss": 11.0894,
"num_tokens": 9906287.0,
"step": 1388
},
{
"epoch": 0.9706498951781971,
"grad_norm": 0.6550659928475684,
"learning_rate": 4.732437486345886e-07,
"loss": 10.9861,
"num_tokens": 9913471.0,
"step": 1389
},
{
"epoch": 0.9713487071977638,
"grad_norm": 0.6731044015412156,
"learning_rate": 4.515047435327491e-07,
"loss": 11.3375,
"num_tokens": 9920776.0,
"step": 1390
},
{
"epoch": 0.9720475192173306,
"grad_norm": 0.6886371291373153,
"learning_rate": 4.3027572099422207e-07,
"loss": 11.3041,
"num_tokens": 9927914.0,
"step": 1391
},
{
"epoch": 0.9727463312368972,
"grad_norm": 0.7315035720036384,
"learning_rate": 4.0955678977436797e-07,
"loss": 11.3239,
"num_tokens": 9934401.0,
"step": 1392
},
{
"epoch": 0.973445143256464,
"grad_norm": 0.6731364179858964,
"learning_rate": 3.893480560153484e-07,
"loss": 11.1288,
"num_tokens": 9941705.0,
"step": 1393
},
{
"epoch": 0.9741439552760307,
"grad_norm": 0.6600863194244329,
"learning_rate": 3.6964962324561593e-07,
"loss": 11.2683,
"num_tokens": 9949413.0,
"step": 1394
},
{
"epoch": 0.9748427672955975,
"grad_norm": 0.6737648893244508,
"learning_rate": 3.504615923793919e-07,
"loss": 11.0837,
"num_tokens": 9956435.0,
"step": 1395
},
{
"epoch": 0.9755415793151643,
"grad_norm": 0.7145328987881118,
"learning_rate": 3.317840617160894e-07,
"loss": 11.2672,
"num_tokens": 9963267.0,
"step": 1396
},
{
"epoch": 0.976240391334731,
"grad_norm": 0.6822610524050463,
"learning_rate": 3.136171269399024e-07,
"loss": 10.9777,
"num_tokens": 9970795.0,
"step": 1397
},
{
"epoch": 0.9769392033542977,
"grad_norm": 0.7139750346598414,
"learning_rate": 2.959608811192283e-07,
"loss": 11.2859,
"num_tokens": 9977599.0,
"step": 1398
},
{
"epoch": 0.9776380153738644,
"grad_norm": 0.6629186760012954,
"learning_rate": 2.7881541470623494e-07,
"loss": 11.3439,
"num_tokens": 9985126.0,
"step": 1399
},
{
"epoch": 0.9783368273934312,
"grad_norm": 0.7021437040550742,
"learning_rate": 2.6218081553638364e-07,
"loss": 11.074,
"num_tokens": 9991636.0,
"step": 1400
},
{
"epoch": 0.9790356394129979,
"grad_norm": 0.651326390715108,
"learning_rate": 2.4605716882801776e-07,
"loss": 11.3278,
"num_tokens": 9999333.0,
"step": 1401
},
{
"epoch": 0.9797344514325647,
"grad_norm": 0.7077963783884662,
"learning_rate": 2.3044455718185253e-07,
"loss": 11.279,
"num_tokens": 10006127.0,
"step": 1402
},
{
"epoch": 0.9804332634521313,
"grad_norm": 0.7275599228420562,
"learning_rate": 2.153430605806195e-07,
"loss": 11.2103,
"num_tokens": 10012832.0,
"step": 1403
},
{
"epoch": 0.9811320754716981,
"grad_norm": 0.6445139570135618,
"learning_rate": 2.0075275638862246e-07,
"loss": 10.9713,
"num_tokens": 10020535.0,
"step": 1404
},
{
"epoch": 0.9818308874912649,
"grad_norm": 0.7116706126407417,
"learning_rate": 1.8667371935133792e-07,
"loss": 11.4616,
"num_tokens": 10027624.0,
"step": 1405
},
{
"epoch": 0.9825296995108316,
"grad_norm": 0.6841858958472258,
"learning_rate": 1.7310602159505974e-07,
"loss": 11.3032,
"num_tokens": 10034683.0,
"step": 1406
},
{
"epoch": 0.9832285115303984,
"grad_norm": 0.6705286305197254,
"learning_rate": 1.6004973262651047e-07,
"loss": 11.3338,
"num_tokens": 10042053.0,
"step": 1407
},
{
"epoch": 0.983927323549965,
"grad_norm": 0.6730415740783933,
"learning_rate": 1.4750491933247512e-07,
"loss": 11.3956,
"num_tokens": 10049229.0,
"step": 1408
},
{
"epoch": 0.9846261355695318,
"grad_norm": 0.7674497304328863,
"learning_rate": 1.3547164597949026e-07,
"loss": 11.5897,
"num_tokens": 10055821.0,
"step": 1409
},
{
"epoch": 0.9853249475890985,
"grad_norm": 0.6146955427792996,
"learning_rate": 1.2394997421347753e-07,
"loss": 11.3468,
"num_tokens": 10064206.0,
"step": 1410
},
{
"epoch": 0.9860237596086653,
"grad_norm": 0.7186141338407501,
"learning_rate": 1.1293996305946631e-07,
"loss": 11.2113,
"num_tokens": 10070918.0,
"step": 1411
},
{
"epoch": 0.986722571628232,
"grad_norm": 0.6471488770051232,
"learning_rate": 1.0244166892124928e-07,
"loss": 11.2701,
"num_tokens": 10078257.0,
"step": 1412
},
{
"epoch": 0.9874213836477987,
"grad_norm": 0.6966594622997705,
"learning_rate": 9.245514558112733e-08,
"loss": 11.3059,
"num_tokens": 10085142.0,
"step": 1413
},
{
"epoch": 0.9881201956673655,
"grad_norm": 0.7492634726939886,
"learning_rate": 8.298044419962069e-08,
"loss": 11.2983,
"num_tokens": 10091532.0,
"step": 1414
},
{
"epoch": 0.9888190076869322,
"grad_norm": 0.67547464453374,
"learning_rate": 7.401761331521372e-08,
"loss": 11.2074,
"num_tokens": 10098630.0,
"step": 1415
},
{
"epoch": 0.989517819706499,
"grad_norm": 0.6997394087199268,
"learning_rate": 6.556669884408839e-08,
"loss": 11.2439,
"num_tokens": 10105381.0,
"step": 1416
},
{
"epoch": 0.9902166317260657,
"grad_norm": 0.6784044100485935,
"learning_rate": 5.7627744079902235e-08,
"loss": 11.2719,
"num_tokens": 10112328.0,
"step": 1417
},
{
"epoch": 0.9909154437456325,
"grad_norm": 0.7376730133275492,
"learning_rate": 5.0200789693588544e-08,
"loss": 11.3005,
"num_tokens": 10118964.0,
"step": 1418
},
{
"epoch": 0.9916142557651991,
"grad_norm": 0.674112638355877,
"learning_rate": 4.32858737330899e-08,
"loss": 11.0974,
"num_tokens": 10125874.0,
"step": 1419
},
{
"epoch": 0.9923130677847659,
"grad_norm": 0.7551764155567859,
"learning_rate": 3.6883031623224926e-08,
"loss": 11.5891,
"num_tokens": 10132274.0,
"step": 1420
},
{
"epoch": 0.9930118798043326,
"grad_norm": 0.7272923742000015,
"learning_rate": 3.099229616547739e-08,
"loss": 11.3233,
"num_tokens": 10138845.0,
"step": 1421
},
{
"epoch": 0.9937106918238994,
"grad_norm": 0.7015750748072532,
"learning_rate": 2.5613697537818504e-08,
"loss": 11.3946,
"num_tokens": 10145801.0,
"step": 1422
},
{
"epoch": 0.9944095038434662,
"grad_norm": 0.6722209770345767,
"learning_rate": 2.074726329457377e-08,
"loss": 11.2842,
"num_tokens": 10152749.0,
"step": 1423
},
{
"epoch": 0.9951083158630328,
"grad_norm": 0.6390873103140037,
"learning_rate": 1.6393018366278602e-08,
"loss": 11.1965,
"num_tokens": 10160312.0,
"step": 1424
},
{
"epoch": 0.9958071278825996,
"grad_norm": 0.7038247897264213,
"learning_rate": 1.2550985059522902e-08,
"loss": 11.2494,
"num_tokens": 10167099.0,
"step": 1425
},
{
"epoch": 0.9965059399021663,
"grad_norm": 0.6666127532974914,
"learning_rate": 9.221183056895566e-09,
"loss": 11.2175,
"num_tokens": 10174489.0,
"step": 1426
},
{
"epoch": 0.9972047519217331,
"grad_norm": 0.6765466784943449,
"learning_rate": 6.4036294168068335e-09,
"loss": 11.3275,
"num_tokens": 10182102.0,
"step": 1427
},
{
"epoch": 0.9979035639412998,
"grad_norm": 0.6964009190338591,
"learning_rate": 4.0983385734660875e-09,
"loss": 11.1313,
"num_tokens": 10188883.0,
"step": 1428
},
{
"epoch": 0.9986023759608665,
"grad_norm": 0.7265152003293984,
"learning_rate": 2.305322336781934e-09,
"loss": 11.3144,
"num_tokens": 10195672.0,
"step": 1429
},
{
"epoch": 0.9993011879804332,
"grad_norm": 0.6683920838823425,
"learning_rate": 1.0245898922844888e-09,
"loss": 11.445,
"num_tokens": 10203012.0,
"step": 1430
},
{
"epoch": 1.0,
"grad_norm": 0.6789763311122045,
"learning_rate": 2.561478011253726e-10,
"loss": 11.2073,
"num_tokens": 10210317.0,
"step": 1431
},
{
"epoch": 1.0,
"step": 1431,
"total_flos": 1443880778727424.0,
"train_loss": 12.112521223552072,
"train_runtime": 11701.6904,
"train_samples_per_second": 7.829,
"train_steps_per_second": 0.122
}
],
"logging_steps": 1.0,
"max_steps": 1431,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1443880778727424.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}