{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.6666666666666665, "eval_steps": 500, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 642.14, "completions/max_terminated_length": 570.34, "completions/mean_length": 420.1125, "completions/mean_terminated_length": 398.3580334472656, "completions/min_length": 250.16, "completions/min_terminated_length": 250.16, "epoch": 0.03333333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 4.710774621900829, "learning_rate": 9.918333333333334e-07, "loss": 0.0113, "num_tokens": 461361.0, "reward": 6.184720268249512, "reward_std": 1.6460176765918733, "rewards/accuracy_reward/mean": 0.31, "rewards/accuracy_reward/std": 0.393804452419281, "rewards/chart_type_reward/mean": 0.83, "rewards/chart_type_reward/std": 0.20554168462753297, "rewards/format_reward/mean": 1.31, "rewards/format_reward/std": 0.8051172530651093, "rewards/length_think_reward/mean": 1.116875, "rewards/length_think_reward/std": 0.35811117276549337, "rewards/num_token_reward/mean": 0.645, "rewards/num_token_reward/std": 0.4071964037418365, "rewards/process_style_reward/mean": 0.7380022585391999, "rewards/process_style_reward/std": 0.20786408737301826, "rewards/table_style_reward/mean": 1.2348430597782134, "rewards/table_style_reward/std": 0.6372630500793457, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.055, "completions/max_length": 627.98, "completions/max_terminated_length": 567.12, "completions/mean_length": 419.4975, "completions/mean_terminated_length": 399.8742425537109, "completions/min_length": 271.46, "completions/min_terminated_length": 271.46, "epoch": 0.06666666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 3.617454327420967, "learning_rate": 9.835e-07, "loss": 0.033, "num_tokens": 922132.0, "reward": 7.684444198608398, "reward_std": 1.0128395134210586, "rewards/accuracy_reward/mean": 0.3675, "rewards/accuracy_reward/std": 0.39272902250289915, "rewards/chart_type_reward/mean": 0.92, "rewards/chart_type_reward/std": 0.10311741709709167, "rewards/format_reward/mean": 1.81, "rewards/format_reward/std": 0.39947535157203673, "rewards/length_think_reward/mean": 1.36, "rewards/length_think_reward/std": 0.18411283910274506, "rewards/num_token_reward/mean": 0.8975, "rewards/num_token_reward/std": 0.21008866012096405, "rewards/process_style_reward/mean": 0.8522701609134674, "rewards/process_style_reward/std": 0.2383432410657406, "rewards/table_style_reward/mean": 1.4771740126609803, "rewards/table_style_reward/std": 0.538226346373558, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.055, "completions/max_length": 663.8, "completions/max_terminated_length": 622.22, "completions/mean_length": 444.205, "completions/mean_terminated_length": 427.1011260986328, "completions/min_length": 294.06, "completions/min_terminated_length": 294.06, "epoch": 0.1, "frac_reward_zero_std": 0.0, "grad_norm": 2.298847294243072, "learning_rate": 9.751666666666666e-07, "loss": 0.0272, "num_tokens": 1393874.0, "reward": 8.196055917739868, "reward_std": 0.8822205343842506, "rewards/accuracy_reward/mean": 0.405, "rewards/accuracy_reward/std": 0.4406168383359909, "rewards/chart_type_reward/mean": 0.91, "rewards/chart_type_reward/std": 0.10892393231391907, "rewards/format_reward/mean": 1.85, "rewards/format_reward/std": 0.3395550119876862, "rewards/length_think_reward/mean": 1.48125, "rewards/length_think_reward/std": 0.04037815436720848, "rewards/num_token_reward/mean": 0.9275, "rewards/num_token_reward/std": 0.16868472278118132, "rewards/process_style_reward/mean": 1.0046203970909118, "rewards/process_style_reward/std": 0.28071307986974714, "rewards/table_style_reward/mean": 1.617685569524765, "rewards/table_style_reward/std": 0.43435441348701714, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0375, "completions/max_length": 590.62, "completions/max_terminated_length": 560.86, "completions/mean_length": 419.71, "completions/mean_terminated_length": 407.2615026855469, "completions/min_length": 300.28, "completions/min_terminated_length": 300.28, "epoch": 0.13333333333333333, "frac_reward_zero_std": 0.04, "grad_norm": 5.01614642751583, "learning_rate": 9.668333333333332e-07, "loss": 0.018, "num_tokens": 1854582.0, "reward": 8.454336786270142, "reward_std": 0.7186576825380325, "rewards/accuracy_reward/mean": 0.445, "rewards/accuracy_reward/std": 0.37084252953529356, "rewards/chart_type_reward/mean": 0.8825, "rewards/chart_type_reward/std": 0.13139761447906495, "rewards/format_reward/mean": 1.875, "rewards/format_reward/std": 0.25387449383735655, "rewards/length_think_reward/mean": 1.491875, "rewards/length_think_reward/std": 0.016481903940439226, "rewards/num_token_reward/mean": 0.935, "rewards/num_token_reward/std": 0.12912438035011292, "rewards/process_style_reward/mean": 1.1697046744823456, "rewards/process_style_reward/std": 0.3359800568223, "rewards/table_style_reward/mean": 1.6552570796012878, "rewards/table_style_reward/std": 0.4633850826323032, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 520.38, "completions/max_terminated_length": 496.48, "completions/mean_length": 381.645, "completions/mean_terminated_length": 376.8010729980469, "completions/min_length": 291.64, "completions/min_terminated_length": 291.64, "epoch": 0.16666666666666666, "frac_reward_zero_std": 0.03, "grad_norm": 3.8768085436240014, "learning_rate": 9.585e-07, "loss": 0.0049, "num_tokens": 2300628.0, "reward": 8.882754230499268, "reward_std": 0.7584551328420639, "rewards/accuracy_reward/mean": 0.6175, "rewards/accuracy_reward/std": 0.41537604093551633, "rewards/chart_type_reward/mean": 0.905, "rewards/chart_type_reward/std": 0.11220384895801544, "rewards/format_reward/mean": 1.93, "rewards/format_reward/std": 0.17845415830612182, "rewards/length_think_reward/mean": 1.49375, "rewards/length_think_reward/std": 0.01767767071723938, "rewards/num_token_reward/mean": 0.965, "rewards/num_token_reward/std": 0.08922707915306091, "rewards/process_style_reward/mean": 1.2509180808067322, "rewards/process_style_reward/std": 0.35428043991327285, "rewards/table_style_reward/mean": 1.7205862057209016, "rewards/table_style_reward/std": 0.3823927499353886, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005, "completions/max_length": 476.06, "completions/max_terminated_length": 464.94, "completions/mean_length": 359.98, "completions/mean_terminated_length": 357.9764294433594, "completions/min_length": 277.3, "completions/min_terminated_length": 277.3, "epoch": 0.2, "frac_reward_zero_std": 0.03, "grad_norm": 4.04444430042266, "learning_rate": 9.501666666666667e-07, "loss": -0.0013, "num_tokens": 2738260.0, "reward": 9.100077533721924, "reward_std": 0.5911612424254418, "rewards/accuracy_reward/mean": 0.64, "rewards/accuracy_reward/std": 0.38784352123737337, "rewards/chart_type_reward/mean": 0.9475, "rewards/chart_type_reward/std": 0.06052331507205963, "rewards/format_reward/mean": 1.96, "rewards/format_reward/std": 0.10336921453475952, "rewards/length_think_reward/mean": 1.489375, "rewards/length_think_reward/std": 0.026885675489902495, "rewards/num_token_reward/mean": 0.98, "rewards/num_token_reward/std": 0.05168460726737976, "rewards/process_style_reward/mean": 1.3212374341487885, "rewards/process_style_reward/std": 0.343754044175148, "rewards/table_style_reward/mean": 1.7619650173187256, "rewards/table_style_reward/std": 0.41162539228796957, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0025, "completions/max_length": 433.68, "completions/max_terminated_length": 429.78, "completions/mean_length": 335.8, "completions/mean_terminated_length": 334.99607177734373, "completions/min_length": 257.7, "completions/min_terminated_length": 257.7, "epoch": 0.23333333333333334, "frac_reward_zero_std": 0.07, "grad_norm": 1.820258234118004, "learning_rate": 9.418333333333332e-07, "loss": -0.0019, "num_tokens": 3165744.0, "reward": 9.284861507415771, "reward_std": 0.4822974817454815, "rewards/accuracy_reward/mean": 0.715, "rewards/accuracy_reward/std": 0.3768920677900314, "rewards/chart_type_reward/mean": 0.955, "rewards/chart_type_reward/std": 0.05690393328666687, "rewards/format_reward/mean": 1.99, "rewards/format_reward/std": 0.028284270763397217, "rewards/length_think_reward/mean": 1.49125, "rewards/length_think_reward/std": 0.01931762829422951, "rewards/num_token_reward/mean": 0.9925, "rewards/num_token_reward/std": 0.021213203072547912, "rewards/process_style_reward/mean": 1.3195704579353333, "rewards/process_style_reward/std": 0.3678068408370018, "rewards/table_style_reward/mean": 1.821540994644165, "rewards/table_style_reward/std": 0.3932980696856976, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005, "completions/max_length": 433.7, "completions/max_terminated_length": 419.2, "completions/mean_length": 331.6575, "completions/mean_terminated_length": 329.5703582763672, "completions/min_length": 261.88, "completions/min_terminated_length": 261.88, "epoch": 0.26666666666666666, "frac_reward_zero_std": 0.04, "grad_norm": 3.996548131664497, "learning_rate": 9.334999999999999e-07, "loss": 0.0016, "num_tokens": 3591319.0, "reward": 9.078651866912843, "reward_std": 0.5775595012307168, "rewards/accuracy_reward/mean": 0.6775, "rewards/accuracy_reward/std": 0.360657674074173, "rewards/chart_type_reward/mean": 0.8875, "rewards/chart_type_reward/std": 0.10328511297702789, "rewards/format_reward/mean": 1.95, "rewards/format_reward/std": 0.13165348529815674, "rewards/length_think_reward/mean": 1.48875, "rewards/length_think_reward/std": 0.021778347939252853, "rewards/num_token_reward/mean": 0.975, "rewards/num_token_reward/std": 0.06582674264907837, "rewards/process_style_reward/mean": 1.3236911845207215, "rewards/process_style_reward/std": 0.39317745611071586, "rewards/table_style_reward/mean": 1.776210721731186, "rewards/table_style_reward/std": 0.3345204618573189, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005, "completions/max_length": 460.6, "completions/max_terminated_length": 458.46, "completions/mean_length": 363.545, "completions/mean_terminated_length": 361.9766668701172, "completions/min_length": 284.26, "completions/min_terminated_length": 284.26, "epoch": 0.3, "frac_reward_zero_std": 0.08, "grad_norm": 7.628999451616129, "learning_rate": 9.251666666666666e-07, "loss": 0.01, "num_tokens": 4030253.0, "reward": 9.122887582778931, "reward_std": 0.406713061761111, "rewards/accuracy_reward/mean": 0.74, "rewards/accuracy_reward/std": 0.33945011138916015, "rewards/chart_type_reward/mean": 0.865, "rewards/chart_type_reward/std": 0.13173707962036132, "rewards/format_reward/mean": 1.99, "rewards/format_reward/std": 0.01851640224456787, "rewards/length_think_reward/mean": 1.49375, "rewards/length_think_reward/std": 0.01767766922712326, "rewards/num_token_reward/mean": 0.995, "rewards/num_token_reward/std": 0.009258201122283935, "rewards/process_style_reward/mean": 1.2878882658481599, "rewards/process_style_reward/std": 0.34421144127845765, "rewards/table_style_reward/mean": 1.751249282360077, "rewards/table_style_reward/std": 0.3605493099242449, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.84, "completions/max_terminated_length": 431.84, "completions/mean_length": 340.835, "completions/mean_terminated_length": 340.835, "completions/min_length": 259.66, "completions/min_terminated_length": 259.66, "epoch": 0.3333333333333333, "frac_reward_zero_std": 0.03, "grad_norm": 3.667965991041236, "learning_rate": 9.168333333333333e-07, "loss": 0.0008, "num_tokens": 4459843.0, "reward": 9.339048709869385, "reward_std": 0.4699262708425522, "rewards/accuracy_reward/mean": 0.6875, "rewards/accuracy_reward/std": 0.3332241028547287, "rewards/chart_type_reward/mean": 0.925, "rewards/chart_type_reward/std": 0.09570688426494599, "rewards/format_reward/mean": 2.0, "rewards/format_reward/std": 0.0, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 1.0, "rewards/num_token_reward/std": 0.0, "rewards/process_style_reward/mean": 1.3655757594108582, "rewards/process_style_reward/std": 0.40603267412632704, "rewards/table_style_reward/mean": 1.860972990989685, "rewards/table_style_reward/std": 0.4130638699233532, "step": 500 }, { "epoch": 0.3333333333333333, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 476.36, "eval_completions/max_terminated_length": 476.36, "eval_completions/mean_length": 301.6075, "eval_completions/mean_terminated_length": 301.6075, "eval_completions/min_length": 198.44, "eval_completions/min_terminated_length": 198.44, "eval_frac_reward_zero_std": 0.495, "eval_loss": 0.0033517335541546345, "eval_num_tokens": 4459843.0, "eval_reward": 7.484729690551758, "eval_reward_std": 0.2337803066149354, "eval_rewards/accuracy_reward/mean": 0.80875, "eval_rewards/accuracy_reward/std": 0.3655660229921341, "eval_rewards/chart_type_reward/mean": 0.6275, "eval_rewards/chart_type_reward/std": 0.46189448714256287, "eval_rewards/format_reward/mean": 1.9775, "eval_rewards/format_reward/std": 0.09993488192558289, "eval_rewards/length_think_reward/mean": 1.5, "eval_rewards/length_think_reward/std": 0.0, "eval_rewards/num_token_reward/mean": 0.98875, "eval_rewards/num_token_reward/std": 0.049967440962791446, "eval_rewards/process_style_reward/mean": 0.8432698702812195, "eval_rewards/process_style_reward/std": 0.27860497415065766, "eval_rewards/table_style_reward/mean": 0.7389598202705383, "eval_rewards/table_style_reward/std": 0.0863262665271759, "eval_runtime": 357.3984, "eval_samples_per_second": 0.56, "eval_steps_per_second": 0.02, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.68, "completions/max_terminated_length": 381.68, "completions/mean_length": 302.93, "completions/mean_terminated_length": 302.93, "completions/min_length": 236.18, "completions/min_terminated_length": 236.18, "epoch": 0.36666666666666664, "frac_reward_zero_std": 0.04, "grad_norm": 4.242303838921506, "learning_rate": 9.085e-07, "loss": -0.0004, "num_tokens": 4874635.0, "reward": 9.339553604125976, "reward_std": 0.3770983973145485, "rewards/accuracy_reward/mean": 0.74, "rewards/accuracy_reward/std": 0.3310369694232941, "rewards/chart_type_reward/mean": 0.9175, "rewards/chart_type_reward/std": 0.09259466350078582, "rewards/format_reward/mean": 1.995, "rewards/format_reward/std": 0.014142135381698609, "rewards/length_think_reward/mean": 1.49875, "rewards/length_think_reward/std": 0.003535533845424652, "rewards/num_token_reward/mean": 0.9975, "rewards/num_token_reward/std": 0.007071067690849304, "rewards/process_style_reward/mean": 1.3991042220592498, "rewards/process_style_reward/std": 0.37223585724830627, "rewards/table_style_reward/mean": 1.791699321269989, "rewards/table_style_reward/std": 0.3688546184077859, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.36, "completions/max_terminated_length": 382.36, "completions/mean_length": 313.265, "completions/mean_terminated_length": 313.265, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.4, "frac_reward_zero_std": 0.06, "grad_norm": 2.6386198633761295, "learning_rate": 9.001666666666667e-07, "loss": -0.0002, "num_tokens": 5292769.0, "reward": 9.358322076797485, "reward_std": 0.45063421681523325, "rewards/accuracy_reward/mean": 0.7475, "rewards/accuracy_reward/std": 0.32666426956653594, "rewards/chart_type_reward/mean": 0.9275, "rewards/chart_type_reward/std": 0.08190421402454376, "rewards/format_reward/mean": 1.985, "rewards/format_reward/std": 0.03265853762626648, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 0.9925, "rewards/num_token_reward/std": 0.01632926881313324, "rewards/process_style_reward/mean": 1.4022967970371247, "rewards/process_style_reward/std": 0.3624314972758293, "rewards/table_style_reward/mean": 1.8035252857208253, "rewards/table_style_reward/std": 0.3688652907311916, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0475, "completions/max_length": 502.48, "completions/max_terminated_length": 479.38, "completions/mean_length": 390.065, "completions/mean_terminated_length": 376.88559692382813, "completions/min_length": 295.92, "completions/min_terminated_length": 295.92, "epoch": 0.43333333333333335, "frac_reward_zero_std": 0.08, "grad_norm": 1.6276070180776299, "learning_rate": 8.918333333333333e-07, "loss": 0.0067, "num_tokens": 5742743.0, "reward": 8.926296434402467, "reward_std": 0.6193729147315026, "rewards/accuracy_reward/mean": 0.7025, "rewards/accuracy_reward/std": 0.33726297795772553, "rewards/chart_type_reward/mean": 0.87, "rewards/chart_type_reward/std": 0.13173707962036132, "rewards/format_reward/mean": 1.88, "rewards/format_reward/std": 0.18836578965187073, "rewards/length_think_reward/mean": 1.476875, "rewards/length_think_reward/std": 0.04876347452402115, "rewards/num_token_reward/mean": 0.9325, "rewards/num_token_reward/std": 0.09855559468269348, "rewards/process_style_reward/mean": 1.3070782232284546, "rewards/process_style_reward/std": 0.3663827758282423, "rewards/table_style_reward/mean": 1.7573432433605194, "rewards/table_style_reward/std": 0.41278132781386373, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04, "completions/max_length": 582.46, "completions/max_terminated_length": 561.88, "completions/mean_length": 474.75, "completions/mean_terminated_length": 462.6513354492188, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "epoch": 0.4666666666666667, "frac_reward_zero_std": 0.03, "grad_norm": 3.113389277396801, "learning_rate": 8.834999999999999e-07, "loss": 0.0051, "num_tokens": 6226015.0, "reward": 9.087711658477783, "reward_std": 0.6245314812660218, "rewards/accuracy_reward/mean": 0.7075, "rewards/accuracy_reward/std": 0.3379362678527832, "rewards/chart_type_reward/mean": 0.875, "rewards/chart_type_reward/std": 0.1228942984342575, "rewards/format_reward/mean": 1.915, "rewards/format_reward/std": 0.14443274736404418, "rewards/length_think_reward/mean": 1.434375, "rewards/length_think_reward/std": 0.12082115039229394, "rewards/num_token_reward/mean": 0.9425, "rewards/num_token_reward/std": 0.11464277982711792, "rewards/process_style_reward/mean": 1.4090502178668975, "rewards/process_style_reward/std": 0.4188565620034933, "rewards/table_style_reward/mean": 1.8042864727973937, "rewards/table_style_reward/std": 0.367302486859262, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08, "completions/max_length": 669.88, "completions/max_terminated_length": 636.52, "completions/mean_length": 543.3925, "completions/mean_terminated_length": 523.3314343261719, "completions/min_length": 434.66, "completions/min_terminated_length": 434.66, "epoch": 0.5, "frac_reward_zero_std": 0.07, "grad_norm": 2.705954822615204, "learning_rate": 8.751666666666666e-07, "loss": 0.0099, "num_tokens": 6735800.0, "reward": 9.080023832321167, "reward_std": 0.6411656188964844, "rewards/accuracy_reward/mean": 0.695, "rewards/accuracy_reward/std": 0.3173846417665482, "rewards/chart_type_reward/mean": 0.9, "rewards/chart_type_reward/std": 0.10690449476242066, "rewards/format_reward/mean": 1.84, "rewards/format_reward/std": 0.2742329239845276, "rewards/length_think_reward/mean": 1.445625, "rewards/length_think_reward/std": 0.09453705742955208, "rewards/num_token_reward/mean": 0.9175, "rewards/num_token_reward/std": 0.1441875296831131, "rewards/process_style_reward/mean": 1.434592843055725, "rewards/process_style_reward/std": 0.4128647920489311, "rewards/table_style_reward/mean": 1.8473060631752014, "rewards/table_style_reward/std": 0.384682634845376, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0425, "completions/max_length": 555.72, "completions/max_terminated_length": 536.3, "completions/mean_length": 459.7675, "completions/mean_terminated_length": 450.3732165527344, "completions/min_length": 376.74, "completions/min_terminated_length": 376.74, "epoch": 0.5333333333333333, "frac_reward_zero_std": 0.1, "grad_norm": 2.744641896369063, "learning_rate": 8.668333333333333e-07, "loss": 0.0041, "num_tokens": 7212935.0, "reward": 9.342824554443359, "reward_std": 0.41833467945456504, "rewards/accuracy_reward/mean": 0.7575, "rewards/accuracy_reward/std": 0.27658932030200956, "rewards/chart_type_reward/mean": 0.9025, "rewards/chart_type_reward/std": 0.10656502962112427, "rewards/format_reward/mean": 1.92, "rewards/format_reward/std": 0.11177420973777771, "rewards/length_think_reward/mean": 1.48875, "rewards/length_think_reward/std": 0.018290950953960418, "rewards/num_token_reward/mean": 0.9575, "rewards/num_token_reward/std": 0.06295817255973817, "rewards/process_style_reward/mean": 1.4902155125141143, "rewards/process_style_reward/std": 0.3751195715367794, "rewards/table_style_reward/mean": 1.8263590598106385, "rewards/table_style_reward/std": 0.3030733197927475, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02, "completions/max_length": 507.02, "completions/max_terminated_length": 496.5, "completions/mean_length": 406.635, "completions/mean_terminated_length": 400.53512084960937, "completions/min_length": 317.5, "completions/min_terminated_length": 317.5, "epoch": 0.5666666666666667, "frac_reward_zero_std": 0.07, "grad_norm": 4.8860738140378, "learning_rate": 8.585e-07, "loss": 0.0018, "num_tokens": 7668721.0, "reward": 9.412762422561645, "reward_std": 0.4385050618648529, "rewards/accuracy_reward/mean": 0.705, "rewards/accuracy_reward/std": 0.333148148059845, "rewards/chart_type_reward/mean": 0.92, "rewards/chart_type_reward/std": 0.09823348283767701, "rewards/format_reward/mean": 1.955, "rewards/format_reward/std": 0.08232370734214783, "rewards/length_think_reward/mean": 1.490625, "rewards/length_think_reward/std": 0.018341146558523178, "rewards/num_token_reward/mean": 0.9775, "rewards/num_token_reward/std": 0.04116185367107392, "rewards/process_style_reward/mean": 1.4954944217205048, "rewards/process_style_reward/std": 0.4515664022415876, "rewards/table_style_reward/mean": 1.8691429781913758, "rewards/table_style_reward/std": 0.2963829467073083, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.54, "completions/max_terminated_length": 464.54, "completions/mean_length": 374.935, "completions/mean_terminated_length": 374.935, "completions/min_length": 296.92, "completions/min_terminated_length": 296.92, "epoch": 0.6, "frac_reward_zero_std": 0.09, "grad_norm": 1.8355684342913006, "learning_rate": 8.501666666666666e-07, "loss": 0.0025, "num_tokens": 8111331.0, "reward": 9.84747314453125, "reward_std": 0.36430649772286416, "rewards/accuracy_reward/mean": 0.7225, "rewards/accuracy_reward/std": 0.2612554532289505, "rewards/chart_type_reward/mean": 0.9475, "rewards/chart_type_reward/std": 0.0391424161195755, "rewards/format_reward/mean": 1.995, "rewards/format_reward/std": 0.014142135381698609, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 0.9975, "rewards/num_token_reward/std": 0.007071067690849304, "rewards/process_style_reward/mean": 1.6478522050380706, "rewards/process_style_reward/std": 0.39793143898248673, "rewards/table_style_reward/mean": 2.0371209740638734, "rewards/table_style_reward/std": 0.3056399393081665, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0075, "completions/max_length": 482.28, "completions/max_terminated_length": 480.06, "completions/mean_length": 378.5375, "completions/mean_terminated_length": 376.3790002441406, "completions/min_length": 300.54, "completions/min_terminated_length": 300.54, "epoch": 0.6333333333333333, "frac_reward_zero_std": 0.1, "grad_norm": 1.3347305735332344, "learning_rate": 8.418333333333333e-07, "loss": 0.008, "num_tokens": 8555522.0, "reward": 9.62767692565918, "reward_std": 0.3843289668299258, "rewards/accuracy_reward/mean": 0.7425, "rewards/accuracy_reward/std": 0.31665275037288665, "rewards/chart_type_reward/mean": 0.93, "rewards/chart_type_reward/std": 0.07483314633369446, "rewards/format_reward/mean": 1.985, "rewards/format_reward/std": 0.02070196866989136, "rewards/length_think_reward/mean": 1.496875, "rewards/length_think_reward/std": 0.004580627083778381, "rewards/num_token_reward/mean": 0.9925, "rewards/num_token_reward/std": 0.01035098433494568, "rewards/process_style_reward/mean": 1.6256941604614257, "rewards/process_style_reward/std": 0.46362122789025306, "rewards/table_style_reward/mean": 1.8551077818870545, "rewards/table_style_reward/std": 0.3706050312891602, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005, "completions/max_length": 462.0, "completions/max_terminated_length": 455.9, "completions/mean_length": 378.345, "completions/mean_terminated_length": 376.565, "completions/min_length": 300.4, "completions/min_terminated_length": 300.4, "epoch": 0.6666666666666666, "frac_reward_zero_std": 0.06, "grad_norm": 2.6667420842164096, "learning_rate": 8.334999999999999e-07, "loss": 0.0022, "num_tokens": 9000848.0, "reward": 9.415374546051025, "reward_std": 0.4214027213305235, "rewards/accuracy_reward/mean": 0.7225, "rewards/accuracy_reward/std": 0.2530916023254395, "rewards/chart_type_reward/mean": 0.895, "rewards/chart_type_reward/std": 0.12104663014411926, "rewards/format_reward/mean": 1.99, "rewards/format_reward/std": 0.01851640224456787, "rewards/length_think_reward/mean": 1.49375, "rewards/length_think_reward/std": 0.011572750806808472, "rewards/num_token_reward/mean": 0.995, "rewards/num_token_reward/std": 0.009258201122283935, "rewards/process_style_reward/mean": 1.488259848356247, "rewards/process_style_reward/std": 0.3761888966709375, "rewards/table_style_reward/mean": 1.8308646750450135, "rewards/table_style_reward/std": 0.3701925078779459, "step": 1000 }, { "epoch": 0.6666666666666666, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.005, "eval_completions/max_length": 588.16, "eval_completions/max_terminated_length": 586.0, "eval_completions/mean_length": 377.64875, "eval_completions/mean_terminated_length": 375.7753918457031, "eval_completions/min_length": 247.76, "eval_completions/min_terminated_length": 247.76, "eval_frac_reward_zero_std": 0.645, "eval_loss": 0.001064616721123457, "eval_num_tokens": 9000848.0, "eval_reward": 7.492321758270264, "eval_reward_std": 0.13835911433212458, "eval_rewards/accuracy_reward/mean": 0.79625, "eval_rewards/accuracy_reward/std": 0.3533613955974579, "eval_rewards/chart_type_reward/mean": 0.61375, "eval_rewards/chart_type_reward/std": 0.46422410249710083, "eval_rewards/format_reward/mean": 1.9875, "eval_rewards/format_reward/std": 0.05197583675384521, "eval_rewards/length_think_reward/mean": 1.49625, "eval_rewards/length_think_reward/std": 0.019564387649297715, "eval_rewards/num_token_reward/mean": 0.9925, "eval_rewards/num_token_reward/std": 0.03305898606777191, "eval_rewards/process_style_reward/mean": 0.8522592496871948, "eval_rewards/process_style_reward/std": 0.25637542963027954, "eval_rewards/table_style_reward/mean": 0.7538124990463256, "eval_rewards/table_style_reward/std": 0.035973691046237946, "eval_runtime": 425.3173, "eval_samples_per_second": 0.47, "eval_steps_per_second": 0.016, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01, "completions/max_length": 489.34, "completions/max_terminated_length": 481.02, "completions/mean_length": 393.5375, "completions/mean_terminated_length": 389.2625, "completions/min_length": 313.12, "completions/min_terminated_length": 313.12, "epoch": 0.7, "frac_reward_zero_std": 0.06, "grad_norm": 2.333315545051143, "learning_rate": 8.251666666666667e-07, "loss": 0.0042, "num_tokens": 9451267.0, "reward": 9.452465152740478, "reward_std": 0.3454449198395014, "rewards/accuracy_reward/mean": 0.695, "rewards/accuracy_reward/std": 0.2591600608825684, "rewards/chart_type_reward/mean": 0.8875, "rewards/chart_type_reward/std": 0.08190421402454376, "rewards/format_reward/mean": 1.98, "rewards/format_reward/std": 0.021380898952484132, "rewards/length_think_reward/mean": 1.49125, "rewards/length_think_reward/std": 0.010938137769699097, "rewards/num_token_reward/mean": 0.99, "rewards/num_token_reward/std": 0.010690449476242066, "rewards/process_style_reward/mean": 1.5581933534145356, "rewards/process_style_reward/std": 0.46272379651665685, "rewards/table_style_reward/mean": 1.8505217921733856, "rewards/table_style_reward/std": 0.37696966528892517, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.34, "completions/max_terminated_length": 496.34, "completions/mean_length": 406.19, "completions/mean_terminated_length": 406.19, "completions/min_length": 335.7, "completions/min_terminated_length": 335.7, "epoch": 0.7333333333333333, "frac_reward_zero_std": 0.09, "grad_norm": 3.6502729861830896, "learning_rate": 8.168333333333333e-07, "loss": 0.0033, "num_tokens": 9907007.0, "reward": 9.746276054382324, "reward_std": 0.30796022541588175, "rewards/accuracy_reward/mean": 0.795, "rewards/accuracy_reward/std": 0.25512682616710664, "rewards/chart_type_reward/mean": 0.94, "rewards/chart_type_reward/std": 0.06127820014953613, "rewards/format_reward/mean": 2.0, "rewards/format_reward/std": 0.0, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 1.0, "rewards/num_token_reward/std": 0.0, "rewards/process_style_reward/mean": 1.5490337193012238, "rewards/process_style_reward/std": 0.4507550221681595, "rewards/table_style_reward/mean": 1.962242330312729, "rewards/table_style_reward/std": 0.3748661072552204, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.02, "completions/max_terminated_length": 463.02, "completions/mean_length": 380.8375, "completions/mean_terminated_length": 380.8375, "completions/min_length": 315.86, "completions/min_terminated_length": 315.86, "epoch": 0.7666666666666667, "frac_reward_zero_std": 0.08, "grad_norm": 1.6467571445573885, "learning_rate": 8.085e-07, "loss": 0.0081, "num_tokens": 10352734.0, "reward": 9.6218558883667, "reward_std": 0.378871104568243, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.27910871148109434, "rewards/chart_type_reward/mean": 0.92, "rewards/chart_type_reward/std": 0.08552359580993653, "rewards/format_reward/mean": 2.0, "rewards/format_reward/std": 0.0, "rewards/length_think_reward/mean": 1.496875, "rewards/length_think_reward/std": 0.00883883535861969, "rewards/num_token_reward/mean": 0.995, "rewards/num_token_reward/std": 0.014142135381698609, "rewards/process_style_reward/mean": 1.5185503327846528, "rewards/process_style_reward/std": 0.408857840411365, "rewards/table_style_reward/mean": 1.9414305424690246, "rewards/table_style_reward/std": 0.4009686389937997, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015, "completions/max_length": 441.24, "completions/max_terminated_length": 433.9, "completions/mean_length": 362.795, "completions/mean_terminated_length": 357.56416687011716, "completions/min_length": 293.56, "completions/min_terminated_length": 293.56, "epoch": 0.8, "frac_reward_zero_std": 0.06, "grad_norm": 3.0683170001047517, "learning_rate": 8.001666666666667e-07, "loss": -0.0007, "num_tokens": 10790708.0, "reward": 9.64956069946289, "reward_std": 0.394139921143651, "rewards/accuracy_reward/mean": 0.72, "rewards/accuracy_reward/std": 0.3199311000108719, "rewards/chart_type_reward/mean": 0.91, "rewards/chart_type_reward/std": 0.09427463591098785, "rewards/format_reward/mean": 1.97, "rewards/format_reward/std": 0.039897301197052, "rewards/length_think_reward/mean": 1.4975, "rewards/length_think_reward/std": 0.004629100561141968, "rewards/num_token_reward/mean": 0.985, "rewards/num_token_reward/std": 0.019948650598526, "rewards/process_style_reward/mean": 1.6004662060737609, "rewards/process_style_reward/std": 0.36032520439475774, "rewards/table_style_reward/mean": 1.966594467163086, "rewards/table_style_reward/std": 0.3408663283288479, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0275, "completions/max_length": 510.16, "completions/max_terminated_length": 501.4, "completions/mean_length": 409.38, "completions/mean_terminated_length": 402.9113104248047, "completions/min_length": 333.52, "completions/min_terminated_length": 333.52, "epoch": 0.8333333333333334, "frac_reward_zero_std": 0.13, "grad_norm": 4.893266426475079, "learning_rate": 7.918333333333333e-07, "loss": -0.0018, "num_tokens": 11247124.0, "reward": 9.399707975387573, "reward_std": 0.4224707083776593, "rewards/accuracy_reward/mean": 0.705, "rewards/accuracy_reward/std": 0.3079745310544968, "rewards/chart_type_reward/mean": 0.91, "rewards/chart_type_reward/std": 0.09621404528617859, "rewards/format_reward/mean": 1.945, "rewards/format_reward/std": 0.09107224106788635, "rewards/length_think_reward/mean": 1.486875, "rewards/length_think_reward/std": 0.025552249252796172, "rewards/num_token_reward/mean": 0.9725, "rewards/num_token_reward/std": 0.045536120533943174, "rewards/process_style_reward/mean": 1.527708315849304, "rewards/process_style_reward/std": 0.4601225584745407, "rewards/table_style_reward/mean": 1.8526247000694276, "rewards/table_style_reward/std": 0.363852179646492, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0325, "completions/max_length": 504.7, "completions/max_terminated_length": 489.56, "completions/mean_length": 413.09, "completions/mean_terminated_length": 403.423857421875, "completions/min_length": 329.26, "completions/min_terminated_length": 329.26, "epoch": 0.8666666666666667, "frac_reward_zero_std": 0.08, "grad_norm": 4.064899633242494, "learning_rate": 7.834999999999999e-07, "loss": 0.0022, "num_tokens": 11706160.0, "reward": 9.2975998878479, "reward_std": 0.42783591762185097, "rewards/accuracy_reward/mean": 0.68, "rewards/accuracy_reward/std": 0.3520024484395981, "rewards/chart_type_reward/mean": 0.9075, "rewards/chart_type_reward/std": 0.10328511297702789, "rewards/format_reward/mean": 1.935, "rewards/format_reward/std": 0.09544337391853333, "rewards/length_think_reward/mean": 1.488125, "rewards/length_think_reward/std": 0.019788713902235033, "rewards/num_token_reward/mean": 0.9675, "rewards/num_token_reward/std": 0.047721686959266665, "rewards/process_style_reward/mean": 1.474567185640335, "rewards/process_style_reward/std": 0.4061433684825897, "rewards/table_style_reward/mean": 1.8449076747894286, "rewards/table_style_reward/std": 0.3341277042776346, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 530.7, "completions/max_terminated_length": 514.46, "completions/mean_length": 428.025, "completions/mean_terminated_length": 420.09643005371095, "completions/min_length": 341.4, "completions/min_terminated_length": 341.4, "epoch": 0.9, "frac_reward_zero_std": 0.07, "grad_norm": 2.5562933049105356, "learning_rate": 7.751666666666666e-07, "loss": 0.0079, "num_tokens": 12170214.0, "reward": 9.505103206634521, "reward_std": 0.33932688400149347, "rewards/accuracy_reward/mean": 0.7425, "rewards/accuracy_reward/std": 0.28146197378635407, "rewards/chart_type_reward/mean": 0.895, "rewards/chart_type_reward/std": 0.11616269588470458, "rewards/format_reward/mean": 1.945, "rewards/format_reward/std": 0.1008401095867157, "rewards/length_think_reward/mean": 1.495, "rewards/length_think_reward/std": 0.007967560291290284, "rewards/num_token_reward/mean": 0.9725, "rewards/num_token_reward/std": 0.05042005479335785, "rewards/process_style_reward/mean": 1.5487450003623962, "rewards/process_style_reward/std": 0.460056491792202, "rewards/table_style_reward/mean": 1.9063581895828248, "rewards/table_style_reward/std": 0.34742515232414006, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0075, "completions/max_length": 482.48, "completions/max_terminated_length": 480.16, "completions/mean_length": 397.7925, "completions/mean_terminated_length": 396.51416748046876, "completions/min_length": 326.3, "completions/min_terminated_length": 326.3, "epoch": 0.9333333333333333, "frac_reward_zero_std": 0.14, "grad_norm": 2.5170689556650028, "learning_rate": 7.668333333333333e-07, "loss": 0.0022, "num_tokens": 12621947.0, "reward": 9.625234622955322, "reward_std": 0.38073745464906095, "rewards/accuracy_reward/mean": 0.775, "rewards/accuracy_reward/std": 0.24745278298854828, "rewards/chart_type_reward/mean": 0.9275, "rewards/chart_type_reward/std": 0.08190421402454376, "rewards/format_reward/mean": 1.975, "rewards/format_reward/std": 0.0609428083896637, "rewards/length_think_reward/mean": 1.499375, "rewards/length_think_reward/std": 0.001767766922712326, "rewards/num_token_reward/mean": 0.9875, "rewards/num_token_reward/std": 0.03047140419483185, "rewards/process_style_reward/mean": 1.5226418220996856, "rewards/process_style_reward/std": 0.409429362565279, "rewards/table_style_reward/mean": 1.9382178807258605, "rewards/table_style_reward/std": 0.3078057858347893, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01, "completions/max_length": 460.66, "completions/max_terminated_length": 455.88, "completions/mean_length": 374.6425, "completions/mean_terminated_length": 372.0279779052734, "completions/min_length": 303.96, "completions/min_terminated_length": 303.96, "epoch": 0.9666666666666667, "frac_reward_zero_std": 0.11, "grad_norm": 3.2253024096985468, "learning_rate": 7.584999999999999e-07, "loss": 0.0018, "num_tokens": 13064872.0, "reward": 9.732238264083863, "reward_std": 0.35529854300431907, "rewards/accuracy_reward/mean": 0.71, "rewards/accuracy_reward/std": 0.2705294406414032, "rewards/chart_type_reward/mean": 0.945, "rewards/chart_type_reward/std": 0.06271044850349426, "rewards/format_reward/mean": 1.97, "rewards/format_reward/std": 0.07508494377136231, "rewards/length_think_reward/mean": 1.495, "rewards/length_think_reward/std": 0.014142136126756667, "rewards/num_token_reward/mean": 0.985, "rewards/num_token_reward/std": 0.037542471885681154, "rewards/process_style_reward/mean": 1.6616894614696502, "rewards/process_style_reward/std": 0.4416278822161257, "rewards/table_style_reward/mean": 1.9655487489700318, "rewards/table_style_reward/std": 0.31115186443552373, "step": 1450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01, "completions/max_length": 430.54, "completions/max_terminated_length": 430.2, "completions/mean_length": 355.42, "completions/mean_terminated_length": 352.83666748046875, "completions/min_length": 287.88, "completions/min_terminated_length": 287.88, "epoch": 1.0, "frac_reward_zero_std": 0.1, "grad_norm": 2.660840181002644, "learning_rate": 7.501666666666666e-07, "loss": 0.0016, "num_tokens": 13500408.0, "reward": 9.479499158859253, "reward_std": 0.3713982145488262, "rewards/accuracy_reward/mean": 0.76, "rewards/accuracy_reward/std": 0.2704376995563507, "rewards/chart_type_reward/mean": 0.905, "rewards/chart_type_reward/std": 0.10294564783573151, "rewards/format_reward/mean": 1.98, "rewards/format_reward/std": 0.03703280448913574, "rewards/length_think_reward/mean": 1.4975, "rewards/length_think_reward/std": 0.007071067690849304, "rewards/num_token_reward/mean": 0.99, "rewards/num_token_reward/std": 0.01851640224456787, "rewards/process_style_reward/mean": 1.5098418951034547, "rewards/process_style_reward/std": 0.43727574720978735, "rewards/table_style_reward/mean": 1.8371572828292846, "rewards/table_style_reward/std": 0.34460590325295926, "step": 1500 }, { "epoch": 1.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 529.84, "eval_completions/max_terminated_length": 529.84, "eval_completions/mean_length": 342.17125, "eval_completions/mean_terminated_length": 342.17125, "eval_completions/min_length": 239.0, "eval_completions/min_terminated_length": 239.0, "eval_frac_reward_zero_std": 0.655, "eval_loss": 0.0062618558295071125, "eval_num_tokens": 13500408.0, "eval_reward": 7.520802612304688, "eval_reward_std": 0.08726703974418343, "eval_rewards/accuracy_reward/mean": 0.795, "eval_rewards/accuracy_reward/std": 0.36195871770381927, "eval_rewards/chart_type_reward/mean": 0.605, "eval_rewards/chart_type_reward/std": 0.4631432008743286, "eval_rewards/format_reward/mean": 2.0, "eval_rewards/format_reward/std": 0.0, "eval_rewards/length_think_reward/mean": 1.5, "eval_rewards/length_think_reward/std": 0.0, "eval_rewards/num_token_reward/mean": 1.0, "eval_rewards/num_token_reward/std": 0.0, "eval_rewards/process_style_reward/mean": 0.8638026165962219, "eval_rewards/process_style_reward/std": 0.26793761402368543, "eval_rewards/table_style_reward/mean": 0.7570000004768371, "eval_rewards/table_style_reward/std": 0.022176709175109863, "eval_runtime": 388.0338, "eval_samples_per_second": 0.515, "eval_steps_per_second": 0.018, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01, "completions/max_length": 406.2, "completions/max_terminated_length": 398.56, "completions/mean_length": 341.1325, "completions/mean_terminated_length": 337.2075, "completions/min_length": 283.22, "completions/min_terminated_length": 283.22, "epoch": 1.0333333333333334, "frac_reward_zero_std": 0.1, "grad_norm": 3.8348164536534792, "learning_rate": 7.418333333333333e-07, "loss": -0.002, "num_tokens": 13929829.0, "reward": 9.53626503944397, "reward_std": 0.3728026695176959, "rewards/accuracy_reward/mean": 0.7, "rewards/accuracy_reward/std": 0.3113518291711807, "rewards/chart_type_reward/mean": 0.8775, "rewards/chart_type_reward/std": 0.13535646140575408, "rewards/format_reward/mean": 1.975, "rewards/format_reward/std": 0.03552303433418274, "rewards/length_think_reward/mean": 1.495625, "rewards/length_think_reward/std": 0.0061997933685779575, "rewards/num_token_reward/mean": 0.9875, "rewards/num_token_reward/std": 0.01776151716709137, "rewards/process_style_reward/mean": 1.5947026538848876, "rewards/process_style_reward/std": 0.43727443397045135, "rewards/table_style_reward/mean": 1.905937442779541, "rewards/table_style_reward/std": 0.3321183892339468, "step": 1550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.86, "completions/max_terminated_length": 399.86, "completions/mean_length": 318.575, "completions/mean_terminated_length": 318.575, "completions/min_length": 249.92, "completions/min_terminated_length": 249.92, "epoch": 1.0666666666666667, "frac_reward_zero_std": 0.08, "grad_norm": 7.001967911074153, "learning_rate": 7.335e-07, "loss": -0.0028, "num_tokens": 14350287.0, "reward": 9.724173564910888, "reward_std": 0.34998391315340993, "rewards/accuracy_reward/mean": 0.8025, "rewards/accuracy_reward/std": 0.2594077849388123, "rewards/chart_type_reward/mean": 0.93, "rewards/chart_type_reward/std": 0.07483314633369446, "rewards/format_reward/mean": 1.995, "rewards/format_reward/std": 0.014142135381698609, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 0.9975, "rewards/num_token_reward/std": 0.007071067690849304, "rewards/process_style_reward/mean": 1.4950222504138946, "rewards/process_style_reward/std": 0.4426997843384743, "rewards/table_style_reward/mean": 2.004151337146759, "rewards/table_style_reward/std": 0.32994183532893656, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0025, "completions/max_length": 388.28, "completions/max_terminated_length": 388.08, "completions/mean_length": 315.705, "completions/mean_terminated_length": 314.9003576660156, "completions/min_length": 250.5, "completions/min_terminated_length": 250.5, "epoch": 1.1, "frac_reward_zero_std": 0.1, "grad_norm": 9.088691176190808, "learning_rate": 7.251666666666665e-07, "loss": -0.0016, "num_tokens": 14770033.0, "reward": 9.393507385253907, "reward_std": 0.37204572021961213, "rewards/accuracy_reward/mean": 0.6925, "rewards/accuracy_reward/std": 0.31723429918289187, "rewards/chart_type_reward/mean": 0.91, "rewards/chart_type_reward/std": 0.09621404528617859, "rewards/format_reward/mean": 1.995, "rewards/format_reward/std": 0.014142135381698609, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 0.9975, "rewards/num_token_reward/std": 0.007071067690849304, "rewards/process_style_reward/mean": 1.388883638381958, "rewards/process_style_reward/std": 0.37191372729837896, "rewards/table_style_reward/mean": 1.909623656272888, "rewards/table_style_reward/std": 0.4070011004060507, "step": 1650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.28, "completions/max_terminated_length": 399.28, "completions/mean_length": 331.9925, "completions/mean_terminated_length": 331.9925, "completions/min_length": 274.02, "completions/min_terminated_length": 274.02, "epoch": 1.1333333333333333, "frac_reward_zero_std": 0.09, "grad_norm": 2.910463682760087, "learning_rate": 7.168333333333333e-07, "loss": -0.0014, "num_tokens": 15195754.0, "reward": 9.631321449279785, "reward_std": 0.3087148568034172, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.22186374604701997, "rewards/chart_type_reward/mean": 0.91, "rewards/chart_type_reward/std": 0.09621404528617859, "rewards/format_reward/mean": 2.0, "rewards/format_reward/std": 0.0, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 1.0, "rewards/num_token_reward/std": 0.0, "rewards/process_style_reward/mean": 1.5510436356067658, "rewards/process_style_reward/std": 0.4377661471068859, "rewards/table_style_reward/mean": 1.9202778291702272, "rewards/table_style_reward/std": 0.3158441584557295, "step": 1700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.14, "completions/max_terminated_length": 403.14, "completions/mean_length": 334.7425, "completions/mean_terminated_length": 334.7425, "completions/min_length": 271.28, "completions/min_terminated_length": 271.28, "epoch": 1.1666666666666667, "frac_reward_zero_std": 0.08, "grad_norm": 2.309058895100409, "learning_rate": 7.085e-07, "loss": -0.002, "num_tokens": 15622895.0, "reward": 9.747770833969117, "reward_std": 0.3141957564931363, "rewards/accuracy_reward/mean": 0.835, "rewards/accuracy_reward/std": 0.1784460115432739, "rewards/chart_type_reward/mean": 0.93, "rewards/chart_type_reward/std": 0.07483314633369446, "rewards/format_reward/mean": 2.0, "rewards/format_reward/std": 0.0, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 1.0, "rewards/num_token_reward/std": 0.0, "rewards/process_style_reward/mean": 1.5280090761184693, "rewards/process_style_reward/std": 0.40883125707507134, "rewards/table_style_reward/mean": 1.954761769771576, "rewards/table_style_reward/std": 0.2725959676504135, "step": 1750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005, "completions/max_length": 411.5, "completions/max_terminated_length": 407.24, "completions/mean_length": 346.24, "completions/mean_terminated_length": 344.6991668701172, "completions/min_length": 293.46, "completions/min_terminated_length": 293.46, "epoch": 1.2, "frac_reward_zero_std": 0.15, "grad_norm": 2.902060665970371, "learning_rate": 7.001666666666667e-07, "loss": 0.0016, "num_tokens": 16054907.0, "reward": 9.750760135650635, "reward_std": 0.31334371890872714, "rewards/accuracy_reward/mean": 0.79, "rewards/accuracy_reward/std": 0.23819708824157715, "rewards/chart_type_reward/mean": 0.95, "rewards/chart_type_reward/std": 0.05345224738121033, "rewards/format_reward/mean": 1.99, "rewards/format_reward/std": 0.01851640224456787, "rewards/length_think_reward/mean": 1.49875, "rewards/length_think_reward/std": 0.002314550280570984, "rewards/num_token_reward/mean": 0.995, "rewards/num_token_reward/std": 0.009258201122283935, "rewards/process_style_reward/mean": 1.5837038934230805, "rewards/process_style_reward/std": 0.40160174869000914, "rewards/table_style_reward/mean": 1.9433062982559204, "rewards/table_style_reward/std": 0.311205018684268, "step": 1800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.98, "completions/max_terminated_length": 413.98, "completions/mean_length": 346.11, "completions/mean_terminated_length": 346.11, "completions/min_length": 287.76, "completions/min_terminated_length": 287.76, "epoch": 1.2333333333333334, "frac_reward_zero_std": 0.06, "grad_norm": 3.4920562160852544, "learning_rate": 6.918333333333333e-07, "loss": 0.0001, "num_tokens": 16486455.0, "reward": 9.697437267303467, "reward_std": 0.3302338109910488, "rewards/accuracy_reward/mean": 0.775, "rewards/accuracy_reward/std": 0.26244561791419985, "rewards/chart_type_reward/mean": 0.89, "rewards/chart_type_reward/std": 0.12432654678821564, "rewards/format_reward/mean": 2.0, "rewards/format_reward/std": 0.0, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 1.0, "rewards/num_token_reward/std": 0.0, "rewards/process_style_reward/mean": 1.6095642995834352, "rewards/process_style_reward/std": 0.4244466606155038, "rewards/table_style_reward/mean": 1.922872955799103, "rewards/table_style_reward/std": 0.3481115462630987, "step": 1850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005, "completions/max_length": 435.06, "completions/max_terminated_length": 435.04, "completions/mean_length": 364.25, "completions/mean_terminated_length": 363.41583374023435, "completions/min_length": 306.08, "completions/min_terminated_length": 306.08, "epoch": 1.2666666666666666, "frac_reward_zero_std": 0.12, "grad_norm": 3.4554706037204777, "learning_rate": 6.835e-07, "loss": -0.0042, "num_tokens": 16925067.0, "reward": 9.758917856216431, "reward_std": 0.3286038258485496, "rewards/accuracy_reward/mean": 0.7975, "rewards/accuracy_reward/std": 0.2225467497110367, "rewards/chart_type_reward/mean": 0.8925, "rewards/chart_type_reward/std": 0.08190421402454376, "rewards/format_reward/mean": 1.99, "rewards/format_reward/std": 0.01851640224456787, "rewards/length_think_reward/mean": 1.499375, "rewards/length_think_reward/std": 0.001767766922712326, "rewards/num_token_reward/mean": 0.995, "rewards/num_token_reward/std": 0.009258201122283935, "rewards/process_style_reward/mean": 1.619047586917877, "rewards/process_style_reward/std": 0.4610636526346207, "rewards/table_style_reward/mean": 1.9654952633380889, "rewards/table_style_reward/std": 0.279680118188262, "step": 1900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.18, "completions/max_terminated_length": 470.18, "completions/mean_length": 388.0075, "completions/mean_terminated_length": 388.0075, "completions/min_length": 316.4, "completions/min_terminated_length": 316.4, "epoch": 1.3, "frac_reward_zero_std": 0.15, "grad_norm": 2.4363619566126182, "learning_rate": 6.751666666666667e-07, "loss": 0.0043, "num_tokens": 17373038.0, "reward": 9.906033611297607, "reward_std": 0.30710218355059626, "rewards/accuracy_reward/mean": 0.81, "rewards/accuracy_reward/std": 0.2447742748260498, "rewards/chart_type_reward/mean": 0.935, "rewards/chart_type_reward/std": 0.07340089797973633, "rewards/format_reward/mean": 2.0, "rewards/format_reward/std": 0.0, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 1.0, "rewards/num_token_reward/std": 0.0, "rewards/process_style_reward/mean": 1.6697776758670806, "rewards/process_style_reward/std": 0.4667322512716055, "rewards/table_style_reward/mean": 1.9912559032440185, "rewards/table_style_reward/std": 0.33009951261803505, "step": 1950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.88, "completions/max_terminated_length": 484.88, "completions/mean_length": 407.5, "completions/mean_terminated_length": 407.5, "completions/min_length": 341.34, "completions/min_terminated_length": 341.34, "epoch": 1.3333333333333333, "frac_reward_zero_std": 0.26, "grad_norm": 4.715118687872951, "learning_rate": 6.668333333333332e-07, "loss": 0.0029, "num_tokens": 17829218.0, "reward": 9.806926441192626, "reward_std": 0.23523858685046434, "rewards/accuracy_reward/mean": 0.805, "rewards/accuracy_reward/std": 0.2573123925924301, "rewards/chart_type_reward/mean": 0.965, "rewards/chart_type_reward/std": 0.0462134838104248, "rewards/format_reward/mean": 2.0, "rewards/format_reward/std": 0.0, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 1.0, "rewards/num_token_reward/std": 0.0, "rewards/process_style_reward/mean": 1.586816476583481, "rewards/process_style_reward/std": 0.42196598000824453, "rewards/table_style_reward/mean": 1.9501098704338073, "rewards/table_style_reward/std": 0.2974155292659998, "step": 2000 }, { "epoch": 1.3333333333333333, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.00125, "eval_completions/max_length": 584.64, "eval_completions/max_terminated_length": 584.32, "eval_completions/mean_length": 393.495, "eval_completions/mean_terminated_length": 393.06229736328123, "eval_completions/min_length": 277.64, "eval_completions/min_terminated_length": 277.64, "eval_frac_reward_zero_std": 0.65, "eval_loss": 0.005326578393578529, "eval_num_tokens": 17829218.0, "eval_reward": 7.547686309814453, "eval_reward_std": 0.10336805308703333, "eval_rewards/accuracy_reward/mean": 0.83125, "eval_rewards/accuracy_reward/std": 0.3227571386098862, "eval_rewards/chart_type_reward/mean": 0.61625, "eval_rewards/chart_type_reward/std": 0.46573135137557986, "eval_rewards/format_reward/mean": 1.995, "eval_rewards/format_reward/std": 0.019674774408340454, "eval_rewards/length_think_reward/mean": 1.5, "eval_rewards/length_think_reward/std": 0.0, "eval_rewards/num_token_reward/mean": 0.9975, "eval_rewards/num_token_reward/std": 0.009837387204170227, "eval_rewards/process_style_reward/mean": 0.8522800421714782, "eval_rewards/process_style_reward/std": 0.25847422659397123, "eval_rewards/table_style_reward/mean": 0.7554062509536743, "eval_rewards/table_style_reward/std": 0.032950252890586854, "eval_runtime": 422.8962, "eval_samples_per_second": 0.473, "eval_steps_per_second": 0.017, "step": 2000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.4, "completions/max_terminated_length": 453.4, "completions/mean_length": 380.1175, "completions/mean_terminated_length": 380.1175, "completions/min_length": 314.92, "completions/min_terminated_length": 314.92, "epoch": 1.3666666666666667, "frac_reward_zero_std": 0.14, "grad_norm": 1.3071829991089465, "learning_rate": 6.584999999999999e-07, "loss": 0.0006, "num_tokens": 18274577.0, "reward": 9.592507076263427, "reward_std": 0.2941449248045683, "rewards/accuracy_reward/mean": 0.71, "rewards/accuracy_reward/std": 0.34862515032291413, "rewards/chart_type_reward/mean": 0.95, "rewards/chart_type_reward/std": 0.05345224738121033, "rewards/format_reward/mean": 2.0, "rewards/format_reward/std": 0.0, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 1.0, "rewards/num_token_reward/std": 0.0, "rewards/process_style_reward/mean": 1.5190071046352387, "rewards/process_style_reward/std": 0.406856027841568, "rewards/table_style_reward/mean": 1.9135000681877137, "rewards/table_style_reward/std": 0.33413831643760206, "step": 2050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0025, "completions/max_length": 449.9, "completions/max_terminated_length": 449.66, "completions/mean_length": 374.58, "completions/mean_terminated_length": 373.85250061035157, "completions/min_length": 307.08, "completions/min_terminated_length": 307.08, "epoch": 1.4, "frac_reward_zero_std": 0.12, "grad_norm": 17.23901734959147, "learning_rate": 6.501666666666666e-07, "loss": 0.0013, "num_tokens": 18717565.0, "reward": 9.62658073425293, "reward_std": 0.41098684968426824, "rewards/accuracy_reward/mean": 0.7425, "rewards/accuracy_reward/std": 0.3153866308927536, "rewards/chart_type_reward/mean": 0.915, "rewards/chart_type_reward/std": 0.09478179693222046, "rewards/format_reward/mean": 1.99, "rewards/format_reward/std": 0.028284270763397217, "rewards/length_think_reward/mean": 1.499375, "rewards/length_think_reward/std": 0.001767766922712326, "rewards/num_token_reward/mean": 0.995, "rewards/num_token_reward/std": 0.014142135381698609, "rewards/process_style_reward/mean": 1.5444307351112365, "rewards/process_style_reward/std": 0.40891373321413993, "rewards/table_style_reward/mean": 1.9402749633789063, "rewards/table_style_reward/std": 0.3305619989708066, "step": 2100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.34, "completions/max_terminated_length": 408.34, "completions/mean_length": 335.9, "completions/mean_terminated_length": 335.9, "completions/min_length": 273.7, "completions/min_terminated_length": 273.7, "epoch": 1.4333333333333333, "frac_reward_zero_std": 0.11, "grad_norm": 3.329962314867653, "learning_rate": 6.418333333333333e-07, "loss": -0.0013, "num_tokens": 19145669.0, "reward": 9.794802322387696, "reward_std": 0.2854239001870155, "rewards/accuracy_reward/mean": 0.7575, "rewards/accuracy_reward/std": 0.28735102355480197, "rewards/chart_type_reward/mean": 0.95, "rewards/chart_type_reward/std": 0.05345224738121033, "rewards/format_reward/mean": 2.0, "rewards/format_reward/std": 0.0, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 1.0, "rewards/num_token_reward/std": 0.0, "rewards/process_style_reward/mean": 1.5570359444618225, "rewards/process_style_reward/std": 0.418922475874424, "rewards/table_style_reward/mean": 2.0302664685249328, "rewards/table_style_reward/std": 0.30267744371667504, "step": 2150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.74, "completions/max_terminated_length": 395.74, "completions/mean_length": 329.2375, "completions/mean_terminated_length": 329.2375, "completions/min_length": 270.2, "completions/min_terminated_length": 270.2, "epoch": 1.4666666666666668, "frac_reward_zero_std": 0.1, "grad_norm": 6.069260839474777, "learning_rate": 6.335e-07, "loss": -0.002, "num_tokens": 19570032.0, "reward": 9.73668830871582, "reward_std": 0.34456304393708703, "rewards/accuracy_reward/mean": 0.765, "rewards/accuracy_reward/std": 0.1955270314216614, "rewards/chart_type_reward/mean": 0.92, "rewards/chart_type_reward/std": 0.08552359580993653, "rewards/format_reward/mean": 1.99, "rewards/format_reward/std": 0.028284270763397217, "rewards/length_think_reward/mean": 1.49625, "rewards/length_think_reward/std": 0.010606602281332016, "rewards/num_token_reward/mean": 0.995, "rewards/num_token_reward/std": 0.014142135381698609, "rewards/process_style_reward/mean": 1.5674074435234069, "rewards/process_style_reward/std": 0.415806692391634, "rewards/table_style_reward/mean": 2.00303085565567, "rewards/table_style_reward/std": 0.2655815637484193, "step": 2200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.58, "completions/max_terminated_length": 425.58, "completions/mean_length": 349.7875, "completions/mean_terminated_length": 349.7875, "completions/min_length": 279.34, "completions/min_terminated_length": 279.34, "epoch": 1.5, "frac_reward_zero_std": 0.13, "grad_norm": 2.1846728325533844, "learning_rate": 6.251666666666667e-07, "loss": 0.0, "num_tokens": 20002643.0, "reward": 9.64163824081421, "reward_std": 0.32703839337453244, "rewards/accuracy_reward/mean": 0.74, "rewards/accuracy_reward/std": 0.3088252305984497, "rewards/chart_type_reward/mean": 0.93, "rewards/chart_type_reward/std": 0.07483314633369446, "rewards/format_reward/mean": 2.0, "rewards/format_reward/std": 0.0, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 1.0, "rewards/num_token_reward/std": 0.0, "rewards/process_style_reward/mean": 1.584349147081375, "rewards/process_style_reward/std": 0.3927363380789757, "rewards/table_style_reward/mean": 1.887289083003998, "rewards/table_style_reward/std": 0.3619528949260712, "step": 2250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 343.5925, "completions/mean_terminated_length": 343.5925, "completions/min_length": 286.94, "completions/min_terminated_length": 286.94, "epoch": 1.5333333333333332, "frac_reward_zero_std": 0.12, "grad_norm": 6.91609600549848, "learning_rate": 6.168333333333333e-07, "loss": -0.0033, "num_tokens": 20433580.0, "reward": 9.802964372634888, "reward_std": 0.2991816225461662, "rewards/accuracy_reward/mean": 0.7675, "rewards/accuracy_reward/std": 0.261347194314003, "rewards/chart_type_reward/mean": 0.94, "rewards/chart_type_reward/std": 0.06414269685745239, "rewards/format_reward/mean": 2.0, "rewards/format_reward/std": 0.0, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 1.0, "rewards/num_token_reward/std": 0.0, "rewards/process_style_reward/mean": 1.6691261100769044, "rewards/process_style_reward/std": 0.37101606719195845, "rewards/table_style_reward/mean": 1.9263382363319397, "rewards/table_style_reward/std": 0.32372167307883504, "step": 2300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01, "completions/max_length": 431.12, "completions/max_terminated_length": 424.64, "completions/mean_length": 358.315, "completions/mean_terminated_length": 354.0025, "completions/min_length": 294.78, "completions/min_terminated_length": 294.78, "epoch": 1.5666666666666667, "frac_reward_zero_std": 0.24, "grad_norm": 0.0, "learning_rate": 6.085e-07, "loss": 0.0007, "num_tokens": 20870814.0, "reward": 9.584474143981934, "reward_std": 0.24904431821312756, "rewards/accuracy_reward/mean": 0.705, "rewards/accuracy_reward/std": 0.27204171717166903, "rewards/chart_type_reward/mean": 0.94, "rewards/chart_type_reward/std": 0.06414269685745239, "rewards/format_reward/mean": 1.98, "rewards/format_reward/std": 0.021380898952484132, "rewards/length_think_reward/mean": 1.499375, "rewards/length_think_reward/std": 0.001767766922712326, "rewards/num_token_reward/mean": 0.99, "rewards/num_token_reward/std": 0.010690449476242066, "rewards/process_style_reward/mean": 1.5493565130233764, "rewards/process_style_reward/std": 0.4589155162498355, "rewards/table_style_reward/mean": 1.920742678642273, "rewards/table_style_reward/std": 0.3032746136933565, "step": 2350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.84, "completions/max_terminated_length": 391.84, "completions/mean_length": 330.47, "completions/mean_terminated_length": 330.47, "completions/min_length": 274.26, "completions/min_terminated_length": 274.26, "epoch": 1.6, "frac_reward_zero_std": 0.18, "grad_norm": 2.026677468691953, "learning_rate": 6.001666666666666e-07, "loss": -0.0019, "num_tokens": 21297010.0, "reward": 9.688363914489747, "reward_std": 0.27519391929730774, "rewards/accuracy_reward/mean": 0.76, "rewards/accuracy_reward/std": 0.2853331530094147, "rewards/chart_type_reward/mean": 0.94, "rewards/chart_type_reward/std": 0.06414269685745239, "rewards/format_reward/mean": 2.0, "rewards/format_reward/std": 0.0, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 1.0, "rewards/num_token_reward/std": 0.0, "rewards/process_style_reward/mean": 1.6027523398399353, "rewards/process_style_reward/std": 0.3893206799030304, "rewards/table_style_reward/mean": 1.8856116580963134, "rewards/table_style_reward/std": 0.3420239106938243, "step": 2400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.06, "completions/max_terminated_length": 380.06, "completions/mean_length": 313.7925, "completions/mean_terminated_length": 313.7925, "completions/min_length": 256.48, "completions/min_terminated_length": 256.48, "epoch": 1.6333333333333333, "frac_reward_zero_std": 0.13, "grad_norm": 5.555080099940961, "learning_rate": 5.918333333333333e-07, "loss": -0.0004, "num_tokens": 21715431.0, "reward": 9.615154209136962, "reward_std": 0.2913367236009799, "rewards/accuracy_reward/mean": 0.765, "rewards/accuracy_reward/std": 0.2847459638118744, "rewards/chart_type_reward/mean": 0.92, "rewards/chart_type_reward/std": 0.08552359580993653, "rewards/format_reward/mean": 2.0, "rewards/format_reward/std": 0.0, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 1.0, "rewards/num_token_reward/std": 0.0, "rewards/process_style_reward/mean": 1.5297248589992523, "rewards/process_style_reward/std": 0.3912577797472477, "rewards/table_style_reward/mean": 1.9004293370246887, "rewards/table_style_reward/std": 0.3435662076622248, "step": 2450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.04, "completions/max_terminated_length": 381.04, "completions/mean_length": 315.7675, "completions/mean_terminated_length": 315.7675, "completions/min_length": 256.08, "completions/min_terminated_length": 256.08, "epoch": 1.6666666666666665, "frac_reward_zero_std": 0.15, "grad_norm": 15.55412501444908, "learning_rate": 5.835e-07, "loss": -0.0012, "num_tokens": 22134838.0, "reward": 9.620207805633544, "reward_std": 0.3084538695216179, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.26951261222362516, "rewards/chart_type_reward/mean": 0.8775, "rewards/chart_type_reward/std": 0.1279459285736084, "rewards/format_reward/mean": 1.99, "rewards/format_reward/std": 0.01851640224456787, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 0.995, "rewards/num_token_reward/std": 0.009258201122283935, "rewards/process_style_reward/mean": 1.5722642850875854, "rewards/process_style_reward/std": 0.4096125695109367, "rewards/table_style_reward/mean": 1.9354435563087464, "rewards/table_style_reward/std": 0.25229784632101654, "step": 2500 }, { "epoch": 1.6666666666666665, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 444.2, "eval_completions/max_terminated_length": 444.2, "eval_completions/mean_length": 306.56375, "eval_completions/mean_terminated_length": 306.56375, "eval_completions/min_length": 212.04, "eval_completions/min_terminated_length": 212.04, "eval_frac_reward_zero_std": 0.69, "eval_loss": 0.0007887376705184579, "eval_num_tokens": 22134838.0, "eval_reward": 7.574729671478272, "eval_reward_std": 0.06697057218523696, "eval_rewards/accuracy_reward/mean": 0.84625, "eval_rewards/accuracy_reward/std": 0.3025382542610168, "eval_rewards/chart_type_reward/mean": 0.625, "eval_rewards/chart_type_reward/std": 0.463611079454422, "eval_rewards/format_reward/mean": 2.0, "eval_rewards/format_reward/std": 0.0, "eval_rewards/length_think_reward/mean": 1.5, "eval_rewards/length_think_reward/std": 0.0, "eval_rewards/num_token_reward/mean": 1.0, "eval_rewards/num_token_reward/std": 0.0, "eval_rewards/process_style_reward/mean": 0.8455421495437622, "eval_rewards/process_style_reward/std": 0.2721589285135269, "eval_rewards/table_style_reward/mean": 0.7579375004768372, "eval_rewards/table_style_reward/std": 0.020825568437576294, "eval_runtime": 332.6249, "eval_samples_per_second": 0.601, "eval_steps_per_second": 0.021, "step": 2500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.08, "completions/max_terminated_length": 370.08, "completions/mean_length": 310.395, "completions/mean_terminated_length": 310.395, "completions/min_length": 252.64, "completions/min_terminated_length": 252.64, "epoch": 1.7, "frac_reward_zero_std": 0.18, "grad_norm": 1.9862931479259052, "learning_rate": 5.751666666666667e-07, "loss": -0.0015, "num_tokens": 22551672.0, "reward": 9.761021976470948, "reward_std": 0.2495887253805995, "rewards/accuracy_reward/mean": 0.715, "rewards/accuracy_reward/std": 0.2910652804374695, "rewards/chart_type_reward/mean": 0.92, "rewards/chart_type_reward/std": 0.08552359580993653, "rewards/format_reward/mean": 2.0, "rewards/format_reward/std": 0.0, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 1.0, "rewards/num_token_reward/std": 0.0, "rewards/process_style_reward/mean": 1.6147488832473755, "rewards/process_style_reward/std": 0.4376735435426235, "rewards/table_style_reward/mean": 2.0112730717658995, "rewards/table_style_reward/std": 0.25577211238443853, "step": 2550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.76, "completions/max_terminated_length": 378.76, "completions/mean_length": 305.165, "completions/mean_terminated_length": 305.165, "completions/min_length": 235.9, "completions/min_terminated_length": 235.9, "epoch": 1.7333333333333334, "frac_reward_zero_std": 0.18, "grad_norm": 3.8761139220743552, "learning_rate": 5.668333333333333e-07, "loss": -0.0004, "num_tokens": 22966918.0, "reward": 9.719212608337402, "reward_std": 0.2860183835402131, "rewards/accuracy_reward/mean": 0.72, "rewards/accuracy_reward/std": 0.27398112654685974, "rewards/chart_type_reward/mean": 0.875, "rewards/chart_type_reward/std": 0.09478179693222046, "rewards/format_reward/mean": 2.0, "rewards/format_reward/std": 0.0, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 1.0, "rewards/num_token_reward/std": 0.0, "rewards/process_style_reward/mean": 1.5869954061508178, "rewards/process_style_reward/std": 0.3870758730173111, "rewards/table_style_reward/mean": 2.037217149734497, "rewards/table_style_reward/std": 0.29504469502717257, "step": 2600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.16, "completions/max_terminated_length": 404.16, "completions/mean_length": 327.3175, "completions/mean_terminated_length": 327.3175, "completions/min_length": 258.78, "completions/min_terminated_length": 258.78, "epoch": 1.7666666666666666, "frac_reward_zero_std": 0.12, "grad_norm": 4.5948173272275, "learning_rate": 5.584999999999999e-07, "loss": -0.0004, "num_tokens": 23391041.0, "reward": 9.734212398529053, "reward_std": 0.3367015665024519, "rewards/accuracy_reward/mean": 0.7325, "rewards/accuracy_reward/std": 0.3094827342033386, "rewards/chart_type_reward/mean": 0.9, "rewards/chart_type_reward/std": 0.10690449476242066, "rewards/format_reward/mean": 2.0, "rewards/format_reward/std": 0.0, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 1.0, "rewards/num_token_reward/std": 0.0, "rewards/process_style_reward/mean": 1.649822280406952, "rewards/process_style_reward/std": 0.43425638109445575, "rewards/table_style_reward/mean": 1.9518901491165161, "rewards/table_style_reward/std": 0.2947008777409792, "step": 2650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.46, "completions/max_terminated_length": 373.46, "completions/mean_length": 316.095, "completions/mean_terminated_length": 316.095, "completions/min_length": 264.2, "completions/min_terminated_length": 264.2, "epoch": 1.8, "frac_reward_zero_std": 0.12, "grad_norm": 2.8524341866405294, "learning_rate": 5.501666666666666e-07, "loss": -0.0016, "num_tokens": 23809963.0, "reward": 9.839047288894653, "reward_std": 0.2964446726441383, "rewards/accuracy_reward/mean": 0.7825, "rewards/accuracy_reward/std": 0.22414826095104218, "rewards/chart_type_reward/mean": 0.88, "rewards/chart_type_reward/std": 0.12828539371490477, "rewards/format_reward/mean": 2.0, "rewards/format_reward/std": 0.0, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 1.0, "rewards/num_token_reward/std": 0.0, "rewards/process_style_reward/mean": 1.6252618777751922, "rewards/process_style_reward/std": 0.43019559178501365, "rewards/table_style_reward/mean": 2.0512854528427122, "rewards/table_style_reward/std": 0.3453987674787641, "step": 2700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005, "completions/max_length": 405.08, "completions/max_terminated_length": 405.02, "completions/mean_length": 330.52, "completions/mean_terminated_length": 328.8041668701172, "completions/min_length": 260.76, "completions/min_terminated_length": 260.76, "epoch": 1.8333333333333335, "frac_reward_zero_std": 0.21, "grad_norm": 3.086957169930822, "learning_rate": 5.418333333333332e-07, "loss": 0.0012, "num_tokens": 24235339.0, "reward": 9.4709245967865, "reward_std": 0.3313306954503059, "rewards/accuracy_reward/mean": 0.6475, "rewards/accuracy_reward/std": 0.315896298289299, "rewards/chart_type_reward/mean": 0.92, "rewards/chart_type_reward/std": 0.08552359580993653, "rewards/format_reward/mean": 1.985, "rewards/format_reward/std": 0.03265853762626648, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 0.9925, "rewards/num_token_reward/std": 0.01632926881313324, "rewards/process_style_reward/mean": 1.5009058022499084, "rewards/process_style_reward/std": 0.41441123938187957, "rewards/table_style_reward/mean": 1.9250187492370605, "rewards/table_style_reward/std": 0.3126064923405647, "step": 2750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.7, "completions/max_terminated_length": 386.7, "completions/mean_length": 320.0475, "completions/mean_terminated_length": 320.0475, "completions/min_length": 260.54, "completions/min_terminated_length": 260.54, "epoch": 1.8666666666666667, "frac_reward_zero_std": 0.17, "grad_norm": 12.10983335214368, "learning_rate": 5.335e-07, "loss": 0.0026, "num_tokens": 24656314.0, "reward": 9.687625350952148, "reward_std": 0.2753653322113678, "rewards/accuracy_reward/mean": 0.7375, "rewards/accuracy_reward/std": 0.27269922077655795, "rewards/chart_type_reward/mean": 0.9, "rewards/chart_type_reward/std": 0.10690449476242066, "rewards/format_reward/mean": 2.0, "rewards/format_reward/std": 0.0, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 1.0, "rewards/num_token_reward/std": 0.0, "rewards/process_style_reward/mean": 1.6192821896076202, "rewards/process_style_reward/std": 0.4189461704902351, "rewards/table_style_reward/mean": 1.9308432340621948, "rewards/table_style_reward/std": 0.31184935322031376, "step": 2800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.46, "completions/max_terminated_length": 415.46, "completions/mean_length": 346.0825, "completions/mean_terminated_length": 346.0825, "completions/min_length": 281.58, "completions/min_terminated_length": 281.58, "epoch": 1.9, "frac_reward_zero_std": 0.14, "grad_norm": 1.9872501325277399, "learning_rate": 5.251666666666667e-07, "loss": 0.0008, "num_tokens": 25088451.0, "reward": 9.661721420288085, "reward_std": 0.27916239865124226, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.2755652844905853, "rewards/chart_type_reward/mean": 0.8975, "rewards/chart_type_reward/std": 0.11397556245326995, "rewards/format_reward/mean": 2.0, "rewards/format_reward/std": 0.0, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 1.0, "rewards/num_token_reward/std": 0.0, "rewards/process_style_reward/mean": 1.5982041406631469, "rewards/process_style_reward/std": 0.3970548979192972, "rewards/table_style_reward/mean": 1.9160172486305236, "rewards/table_style_reward/std": 0.2998810928501189, "step": 2850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.2, "completions/max_terminated_length": 395.2, "completions/mean_length": 320.93, "completions/mean_terminated_length": 320.93, "completions/min_length": 261.28, "completions/min_terminated_length": 261.28, "epoch": 1.9333333333333333, "frac_reward_zero_std": 0.14, "grad_norm": 2.9893443101936286, "learning_rate": 5.168333333333334e-07, "loss": 0.0022, "num_tokens": 25510639.0, "reward": 9.695433053970337, "reward_std": 0.26316976999863984, "rewards/accuracy_reward/mean": 0.78, "rewards/accuracy_reward/std": 0.30066137969493867, "rewards/chart_type_reward/mean": 0.91, "rewards/chart_type_reward/std": 0.05345224738121033, "rewards/format_reward/mean": 2.0, "rewards/format_reward/std": 0.0, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 1.0, "rewards/num_token_reward/std": 0.0, "rewards/process_style_reward/mean": 1.5607620286941528, "rewards/process_style_reward/std": 0.42447459913790225, "rewards/table_style_reward/mean": 1.9446710109710694, "rewards/table_style_reward/std": 0.30673831250518563, "step": 2900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.78, "completions/max_terminated_length": 391.78, "completions/mean_length": 321.075, "completions/mean_terminated_length": 321.075, "completions/min_length": 255.9, "completions/min_terminated_length": 255.9, "epoch": 1.9666666666666668, "frac_reward_zero_std": 0.14, "grad_norm": 1.9919505127604515, "learning_rate": 5.085e-07, "loss": -0.0031, "num_tokens": 25932085.0, "reward": 9.60394895553589, "reward_std": 0.3279293935373426, "rewards/accuracy_reward/mean": 0.69, "rewards/accuracy_reward/std": 0.31488103687763214, "rewards/chart_type_reward/mean": 0.9, "rewards/chart_type_reward/std": 0.08552359580993653, "rewards/format_reward/mean": 2.0, "rewards/format_reward/std": 0.0, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 1.0, "rewards/num_token_reward/std": 0.0, "rewards/process_style_reward/mean": 1.5492809236049652, "rewards/process_style_reward/std": 0.43841719649732114, "rewards/table_style_reward/mean": 1.9646680784225463, "rewards/table_style_reward/std": 0.2879634938389063, "step": 2950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.38, "completions/max_terminated_length": 395.38, "completions/mean_length": 328.125, "completions/mean_terminated_length": 328.125, "completions/min_length": 269.92, "completions/min_terminated_length": 269.92, "epoch": 2.0, "frac_reward_zero_std": 0.15, "grad_norm": 2.558583506193219, "learning_rate": 5.001666666666666e-07, "loss": 0.0015, "num_tokens": 26356915.0, "reward": 9.858510990142822, "reward_std": 0.2589301385357976, "rewards/accuracy_reward/mean": 0.8175, "rewards/accuracy_reward/std": 0.23172805666923524, "rewards/chart_type_reward/mean": 0.8875, "rewards/chart_type_reward/std": 0.12651368021965026, "rewards/format_reward/mean": 2.0, "rewards/format_reward/std": 0.0, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 1.0, "rewards/num_token_reward/std": 0.0, "rewards/process_style_reward/mean": 1.6852520942687987, "rewards/process_style_reward/std": 0.3970717826485634, "rewards/table_style_reward/mean": 1.968258823156357, "rewards/table_style_reward/std": 0.29191130749881267, "step": 3000 }, { "epoch": 2.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 442.64, "eval_completions/max_terminated_length": 442.64, "eval_completions/mean_length": 303.75625, "eval_completions/mean_terminated_length": 303.75625, "eval_completions/min_length": 209.52, "eval_completions/min_terminated_length": 209.52, "eval_frac_reward_zero_std": 0.695, "eval_loss": 0.0015344952698796988, "eval_num_tokens": 26356915.0, "eval_reward": 7.581516437530517, "eval_reward_std": 0.062049417817033825, "eval_rewards/accuracy_reward/mean": 0.85375, "eval_rewards/accuracy_reward/std": 0.3139296269416809, "eval_rewards/chart_type_reward/mean": 0.62875, "eval_rewards/chart_type_reward/std": 0.4629242014884949, "eval_rewards/format_reward/mean": 2.0, "eval_rewards/format_reward/std": 0.0, "eval_rewards/length_think_reward/mean": 1.5, "eval_rewards/length_think_reward/std": 0.0, "eval_rewards/num_token_reward/mean": 1.0, "eval_rewards/num_token_reward/std": 0.0, "eval_rewards/process_style_reward/mean": 0.840136501789093, "eval_rewards/process_style_reward/std": 0.2791509646177292, "eval_rewards/table_style_reward/mean": 0.7588799333572388, "eval_rewards/table_style_reward/std": 0.02718144789338112, "eval_runtime": 333.1256, "eval_samples_per_second": 0.6, "eval_steps_per_second": 0.021, "step": 3000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.1, "completions/max_terminated_length": 375.1, "completions/mean_length": 312.9625, "completions/mean_terminated_length": 312.9625, "completions/min_length": 254.74, "completions/min_terminated_length": 254.74, "epoch": 2.033333333333333, "frac_reward_zero_std": 0.21, "grad_norm": 2.2475136992847617, "learning_rate": 4.918333333333333e-07, "loss": -0.0057, "num_tokens": 26775592.0, "reward": 9.69863618850708, "reward_std": 0.28390467911958694, "rewards/accuracy_reward/mean": 0.7375, "rewards/accuracy_reward/std": 0.2776748961210251, "rewards/chart_type_reward/mean": 0.8975, "rewards/chart_type_reward/std": 0.10656502962112427, "rewards/format_reward/mean": 2.0, "rewards/format_reward/std": 0.0, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 1.0, "rewards/num_token_reward/std": 0.0, "rewards/process_style_reward/mean": 1.6072617268562317, "rewards/process_style_reward/std": 0.3515561890602112, "rewards/table_style_reward/mean": 1.956374499797821, "rewards/table_style_reward/std": 0.3190466545522213, "step": 3050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.64, "completions/max_terminated_length": 388.64, "completions/mean_length": 331.075, "completions/mean_terminated_length": 331.075, "completions/min_length": 278.64, "completions/min_terminated_length": 278.64, "epoch": 2.066666666666667, "frac_reward_zero_std": 0.21, "grad_norm": 1.6861060018846619, "learning_rate": 4.835e-07, "loss": 0.0028, "num_tokens": 27200726.0, "reward": 9.909198265075684, "reward_std": 0.26314487379044293, "rewards/accuracy_reward/mean": 0.7575, "rewards/accuracy_reward/std": 0.24443480968475342, "rewards/chart_type_reward/mean": 0.92, "rewards/chart_type_reward/std": 0.08552359580993653, "rewards/format_reward/mean": 2.0, "rewards/format_reward/std": 0.0, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 1.0, "rewards/num_token_reward/std": 0.0, "rewards/process_style_reward/mean": 1.714371042251587, "rewards/process_style_reward/std": 0.40186877727508546, "rewards/table_style_reward/mean": 2.017327206134796, "rewards/table_style_reward/std": 0.3402495227381587, "step": 3100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0025, "completions/max_length": 411.52, "completions/max_terminated_length": 411.02, "completions/mean_length": 335.7075, "completions/mean_terminated_length": 335.0121435546875, "completions/min_length": 269.54, "completions/min_terminated_length": 269.54, "epoch": 2.1, "frac_reward_zero_std": 0.18, "grad_norm": 1.7582515763042907, "learning_rate": 4.7516666666666667e-07, "loss": -0.0049, "num_tokens": 27628089.0, "reward": 9.542483215332032, "reward_std": 0.29263645596802235, "rewards/accuracy_reward/mean": 0.7425, "rewards/accuracy_reward/std": 0.29206632256507875, "rewards/chart_type_reward/mean": 0.88, "rewards/chart_type_reward/std": 0.10690449476242066, "rewards/format_reward/mean": 1.995, "rewards/format_reward/std": 0.014142135381698609, "rewards/length_think_reward/mean": 1.498125, "rewards/length_think_reward/std": 0.005303300619125366, "rewards/num_token_reward/mean": 0.9975, "rewards/num_token_reward/std": 0.007071067690849304, "rewards/process_style_reward/mean": 1.4842811024188995, "rewards/process_style_reward/std": 0.40244101256132125, "rewards/table_style_reward/mean": 1.9450771474838258, "rewards/table_style_reward/std": 0.3524085796624422, "step": 3150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.78, "completions/max_terminated_length": 389.78, "completions/mean_length": 324.99, "completions/mean_terminated_length": 324.99, "completions/min_length": 267.42, "completions/min_terminated_length": 267.42, "epoch": 2.1333333333333333, "frac_reward_zero_std": 0.23, "grad_norm": 5.2003103180671655, "learning_rate": 4.668333333333333e-07, "loss": -0.0014, "num_tokens": 28050929.0, "reward": 9.795648279190063, "reward_std": 0.27396415136754515, "rewards/accuracy_reward/mean": 0.745, "rewards/accuracy_reward/std": 0.26900545120239255, "rewards/chart_type_reward/mean": 0.96, "rewards/chart_type_reward/std": 0.042761797904968264, "rewards/format_reward/mean": 2.0, "rewards/format_reward/std": 0.0, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 1.0, "rewards/num_token_reward/std": 0.0, "rewards/process_style_reward/mean": 1.6426741647720338, "rewards/process_style_reward/std": 0.4128600428253412, "rewards/table_style_reward/mean": 1.947974135875702, "rewards/table_style_reward/std": 0.27850259508937597, "step": 3200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0075, "completions/max_length": 378.94, "completions/max_terminated_length": 378.38, "completions/mean_length": 312.8675, "completions/mean_terminated_length": 310.1270001220703, "completions/min_length": 253.22, "completions/min_terminated_length": 253.22, "epoch": 2.1666666666666665, "frac_reward_zero_std": 0.27, "grad_norm": 8.03823522353211, "learning_rate": 4.585e-07, "loss": 0.003, "num_tokens": 28468656.0, "reward": 9.798268947601319, "reward_std": 0.23171548346057536, "rewards/accuracy_reward/mean": 0.8225, "rewards/accuracy_reward/std": 0.21219169199466706, "rewards/chart_type_reward/mean": 0.895, "rewards/chart_type_reward/std": 0.11616269588470458, "rewards/format_reward/mean": 1.985, "rewards/format_reward/std": 0.02070196866989136, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 0.9925, "rewards/num_token_reward/std": 0.01035098433494568, "rewards/process_style_reward/mean": 1.56619358420372, "rewards/process_style_reward/std": 0.4133440139889717, "rewards/table_style_reward/mean": 2.037075364589691, "rewards/table_style_reward/std": 0.29497910317033527, "step": 3250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.38, "completions/max_terminated_length": 354.38, "completions/mean_length": 292.085, "completions/mean_terminated_length": 292.085, "completions/min_length": 233.4, "completions/min_terminated_length": 233.4, "epoch": 2.2, "frac_reward_zero_std": 0.26, "grad_norm": 3.7658961156032484, "learning_rate": 4.5016666666666664e-07, "loss": -0.0009, "num_tokens": 28878814.0, "reward": 9.694805870056152, "reward_std": 0.183432172909379, "rewards/accuracy_reward/mean": 0.85, "rewards/accuracy_reward/std": 0.13963742017745973, "rewards/chart_type_reward/mean": 0.9125, "rewards/chart_type_reward/std": 0.07306143283843994, "rewards/format_reward/mean": 2.0, "rewards/format_reward/std": 0.0, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 1.0, "rewards/num_token_reward/std": 0.0, "rewards/process_style_reward/mean": 1.5138389551639557, "rewards/process_style_reward/std": 0.44110236927866936, "rewards/table_style_reward/mean": 1.9184669041633606, "rewards/table_style_reward/std": 0.3392397094517946, "step": 3300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.02, "completions/max_terminated_length": 384.02, "completions/mean_length": 319.64, "completions/mean_terminated_length": 319.64, "completions/min_length": 260.52, "completions/min_terminated_length": 260.52, "epoch": 2.2333333333333334, "frac_reward_zero_std": 0.26, "grad_norm": 2.7631543444157165, "learning_rate": 4.4183333333333335e-07, "loss": -0.0016, "num_tokens": 29299838.0, "reward": 9.804728012084961, "reward_std": 0.2311769995908253, "rewards/accuracy_reward/mean": 0.8175, "rewards/accuracy_reward/std": 0.22684662878513337, "rewards/chart_type_reward/mean": 0.91, "rewards/chart_type_reward/std": 0.09621404528617859, "rewards/format_reward/mean": 2.0, "rewards/format_reward/std": 0.0, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 1.0, "rewards/num_token_reward/std": 0.0, "rewards/process_style_reward/mean": 1.630505404472351, "rewards/process_style_reward/std": 0.40631200328469275, "rewards/table_style_reward/mean": 1.9467226195335388, "rewards/table_style_reward/std": 0.25668422447517514, "step": 3350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.78, "completions/max_terminated_length": 390.78, "completions/mean_length": 311.04, "completions/mean_terminated_length": 311.04, "completions/min_length": 244.4, "completions/min_terminated_length": 244.4, "epoch": 2.2666666666666666, "frac_reward_zero_std": 0.12, "grad_norm": 3.7039274908559348, "learning_rate": 4.3349999999999996e-07, "loss": -0.0003, "num_tokens": 29716978.0, "reward": 9.710132846832275, "reward_std": 0.27550872176885605, "rewards/accuracy_reward/mean": 0.78, "rewards/accuracy_reward/std": 0.258406742811203, "rewards/chart_type_reward/mean": 0.95, "rewards/chart_type_reward/std": 0.05345224738121033, "rewards/format_reward/mean": 2.0, "rewards/format_reward/std": 0.0, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 1.0, "rewards/num_token_reward/std": 0.0, "rewards/process_style_reward/mean": 1.5488644635677338, "rewards/process_style_reward/std": 0.4340578323602676, "rewards/table_style_reward/mean": 1.9312683844566345, "rewards/table_style_reward/std": 0.29763940557837487, "step": 3400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.52, "completions/max_terminated_length": 365.52, "completions/mean_length": 307.235, "completions/mean_terminated_length": 307.235, "completions/min_length": 259.8, "completions/min_terminated_length": 259.8, "epoch": 2.3, "frac_reward_zero_std": 0.19, "grad_norm": 3.2607533668304947, "learning_rate": 4.2516666666666667e-07, "loss": -0.0013, "num_tokens": 30133056.0, "reward": 9.773553447723389, "reward_std": 0.30064396366477014, "rewards/accuracy_reward/mean": 0.7575, "rewards/accuracy_reward/std": 0.2782620853185654, "rewards/chart_type_reward/mean": 0.9225, "rewards/chart_type_reward/std": 0.08518413066864014, "rewards/format_reward/mean": 2.0, "rewards/format_reward/std": 0.0, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 1.0, "rewards/num_token_reward/std": 0.0, "rewards/process_style_reward/mean": 1.6084586906433105, "rewards/process_style_reward/std": 0.41134398311376574, "rewards/table_style_reward/mean": 1.9850947809219361, "rewards/table_style_reward/std": 0.31916536355391145, "step": 3450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005, "completions/max_length": 375.18, "completions/max_terminated_length": 374.2, "completions/mean_length": 316.03, "completions/mean_terminated_length": 313.87, "completions/min_length": 264.38, "completions/min_terminated_length": 264.38, "epoch": 2.3333333333333335, "frac_reward_zero_std": 0.19, "grad_norm": 1.990920818414865, "learning_rate": 4.1683333333333333e-07, "loss": 0.0034, "num_tokens": 30552900.0, "reward": 9.88123031616211, "reward_std": 0.2634401721076574, "rewards/accuracy_reward/mean": 0.78, "rewards/accuracy_reward/std": 0.2654762434959412, "rewards/chart_type_reward/mean": 0.9525, "rewards/chart_type_reward/std": 0.05311278223991394, "rewards/format_reward/mean": 1.99, "rewards/format_reward/std": 0.01851640224456787, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 0.995, "rewards/num_token_reward/std": 0.009258201122283935, "rewards/process_style_reward/mean": 1.6290400648117065, "rewards/process_style_reward/std": 0.41983593456447127, "rewards/table_style_reward/mean": 2.0346901965141297, "rewards/table_style_reward/std": 0.28429963132366537, "step": 3500 }, { "epoch": 2.3333333333333335, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 434.36, "eval_completions/max_terminated_length": 434.36, "eval_completions/mean_length": 297.61625, "eval_completions/mean_terminated_length": 297.61625, "eval_completions/min_length": 204.8, "eval_completions/min_terminated_length": 204.8, "eval_frac_reward_zero_std": 0.665, "eval_loss": 0.0004025402304250747, "eval_num_tokens": 30552900.0, "eval_reward": 7.552074928283691, "eval_reward_std": 0.0732093141740188, "eval_rewards/accuracy_reward/mean": 0.855, "eval_rewards/accuracy_reward/std": 0.3025389724969864, "eval_rewards/chart_type_reward/mean": 0.6025, "eval_rewards/chart_type_reward/std": 0.4713040769100189, "eval_rewards/format_reward/mean": 2.0, "eval_rewards/format_reward/std": 0.0, "eval_rewards/length_think_reward/mean": 1.5, "eval_rewards/length_think_reward/std": 0.0, "eval_rewards/num_token_reward/mean": 1.0, "eval_rewards/num_token_reward/std": 0.0, "eval_rewards/process_style_reward/mean": 0.8350749087333679, "eval_rewards/process_style_reward/std": 0.28666666328907014, "eval_rewards/table_style_reward/mean": 0.7595000004768372, "eval_rewards/table_style_reward/std": 0.023632641285657882, "eval_runtime": 327.7189, "eval_samples_per_second": 0.61, "eval_steps_per_second": 0.021, "step": 3500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.26, "completions/max_terminated_length": 368.26, "completions/mean_length": 310.085, "completions/mean_terminated_length": 310.085, "completions/min_length": 255.86, "completions/min_terminated_length": 255.86, "epoch": 2.3666666666666667, "frac_reward_zero_std": 0.15, "grad_norm": 2.559054191104846, "learning_rate": 4.0849999999999993e-07, "loss": -0.0016, "num_tokens": 30970094.0, "reward": 9.84461862564087, "reward_std": 0.31480611886829135, "rewards/accuracy_reward/mean": 0.7875, "rewards/accuracy_reward/std": 0.24949051320552826, "rewards/chart_type_reward/mean": 0.9325, "rewards/chart_type_reward/std": 0.08047196567058564, "rewards/format_reward/mean": 2.0, "rewards/format_reward/std": 0.0, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 1.0, "rewards/num_token_reward/std": 0.0, "rewards/process_style_reward/mean": 1.6563871276378632, "rewards/process_style_reward/std": 0.47597916625440123, "rewards/table_style_reward/mean": 1.9682315516471862, "rewards/table_style_reward/std": 0.31802774131298067, "step": 3550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005, "completions/max_length": 392.88, "completions/max_terminated_length": 392.72, "completions/mean_length": 328.53, "completions/mean_terminated_length": 327.8175, "completions/min_length": 269.42, "completions/min_terminated_length": 269.42, "epoch": 2.4, "frac_reward_zero_std": 0.2, "grad_norm": 1.3441577493975225, "learning_rate": 4.0016666666666664e-07, "loss": 0.0021, "num_tokens": 31394678.0, "reward": 9.768132333755494, "reward_std": 0.2865133846178651, "rewards/accuracy_reward/mean": 0.7325, "rewards/accuracy_reward/std": 0.3207777261734009, "rewards/chart_type_reward/mean": 0.8875, "rewards/chart_type_reward/std": 0.11254331409931183, "rewards/format_reward/mean": 1.99, "rewards/format_reward/std": 0.01851640224456787, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 0.995, "rewards/num_token_reward/std": 0.009258201122283935, "rewards/process_style_reward/mean": 1.6195254147052764, "rewards/process_style_reward/std": 0.4645379837602377, "rewards/table_style_reward/mean": 2.0436069059371946, "rewards/table_style_reward/std": 0.28304173408076166, "step": 3600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.54, "completions/max_terminated_length": 357.54, "completions/mean_length": 299.01, "completions/mean_terminated_length": 299.01, "completions/min_length": 244.34, "completions/min_terminated_length": 244.34, "epoch": 2.4333333333333336, "frac_reward_zero_std": 0.21, "grad_norm": 2.849518244313829, "learning_rate": 3.918333333333333e-07, "loss": 0.0003, "num_tokens": 31807526.0, "reward": 9.800979099273682, "reward_std": 0.25777424886822703, "rewards/accuracy_reward/mean": 0.7925, "rewards/accuracy_reward/std": 0.24965977609157564, "rewards/chart_type_reward/mean": 0.89, "rewards/chart_type_reward/std": 0.11759494423866272, "rewards/format_reward/mean": 2.0, "rewards/format_reward/std": 0.0, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 1.0, "rewards/num_token_reward/std": 0.0, "rewards/process_style_reward/mean": 1.5430667841434478, "rewards/process_style_reward/std": 0.4256218123435974, "rewards/table_style_reward/mean": 2.075412368774414, "rewards/table_style_reward/std": 0.3282872153446078, "step": 3650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.08, "completions/max_terminated_length": 370.08, "completions/mean_length": 309.9025, "completions/mean_terminated_length": 309.9025, "completions/min_length": 254.94, "completions/min_terminated_length": 254.94, "epoch": 2.466666666666667, "frac_reward_zero_std": 0.22, "grad_norm": 2.1631571281627955, "learning_rate": 3.835e-07, "loss": -0.0024, "num_tokens": 32224583.0, "reward": 9.830465087890625, "reward_std": 0.23616183903533966, "rewards/accuracy_reward/mean": 0.755, "rewards/accuracy_reward/std": 0.27062118172645566, "rewards/chart_type_reward/mean": 0.925, "rewards/chart_type_reward/std": 0.08409134745597839, "rewards/format_reward/mean": 1.995, "rewards/format_reward/std": 0.014142135381698609, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 0.9975, "rewards/num_token_reward/std": 0.007071067690849304, "rewards/process_style_reward/mean": 1.6658795142173768, "rewards/process_style_reward/std": 0.4328185883164406, "rewards/table_style_reward/mean": 1.992085530757904, "rewards/table_style_reward/std": 0.30545895665884015, "step": 3700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005, "completions/max_length": 374.04, "completions/max_terminated_length": 374.02, "completions/mean_length": 316.38, "completions/mean_terminated_length": 315.13416748046876, "completions/min_length": 261.26, "completions/min_terminated_length": 261.26, "epoch": 2.5, "frac_reward_zero_std": 0.18, "grad_norm": 3.743160685463559, "learning_rate": 3.751666666666666e-07, "loss": -0.0015, "num_tokens": 32644799.0, "reward": 9.782908029556275, "reward_std": 0.2735483956709504, "rewards/accuracy_reward/mean": 0.825, "rewards/accuracy_reward/std": 0.2005802285671234, "rewards/chart_type_reward/mean": 0.91, "rewards/chart_type_reward/std": 0.07483314633369446, "rewards/format_reward/mean": 1.99, "rewards/format_reward/std": 0.01851640224456787, "rewards/length_think_reward/mean": 1.499375, "rewards/length_think_reward/std": 0.001767766922712326, "rewards/num_token_reward/mean": 0.995, "rewards/num_token_reward/std": 0.009258201122283935, "rewards/process_style_reward/mean": 1.5454747760295868, "rewards/process_style_reward/std": 0.431058616489172, "rewards/table_style_reward/mean": 2.0180582451820372, "rewards/table_style_reward/std": 0.3305081824213266, "step": 3750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 333.855, "completions/mean_terminated_length": 333.855, "completions/min_length": 273.8, "completions/min_terminated_length": 273.8, "epoch": 2.533333333333333, "frac_reward_zero_std": 0.2, "grad_norm": 5.8673809839930975, "learning_rate": 3.6683333333333333e-07, "loss": -0.0002, "num_tokens": 33072297.0, "reward": 9.738230361938477, "reward_std": 0.2538567354902625, "rewards/accuracy_reward/mean": 0.7525, "rewards/accuracy_reward/std": 0.27658367991447447, "rewards/chart_type_reward/mean": 0.9, "rewards/chart_type_reward/std": 0.10690449476242066, "rewards/format_reward/mean": 2.0, "rewards/format_reward/std": 0.0, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 1.0, "rewards/num_token_reward/std": 0.0, "rewards/process_style_reward/mean": 1.6415416550636293, "rewards/process_style_reward/std": 0.433953458070755, "rewards/table_style_reward/mean": 1.9441887497901917, "rewards/table_style_reward/std": 0.32067165344953535, "step": 3800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.32, "completions/max_terminated_length": 385.32, "completions/mean_length": 327.8975, "completions/mean_terminated_length": 327.8975, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 2.5666666666666664, "frac_reward_zero_std": 0.19, "grad_norm": 3.653425622305845, "learning_rate": 3.585e-07, "loss": -0.0016, "num_tokens": 33496748.0, "reward": 9.902756881713866, "reward_std": 0.27108980235410857, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 0.20225769460201262, "rewards/chart_type_reward/mean": 0.89, "rewards/chart_type_reward/std": 0.11759494423866272, "rewards/format_reward/mean": 2.0, "rewards/format_reward/std": 0.0, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 1.0, "rewards/num_token_reward/std": 0.0, "rewards/process_style_reward/mean": 1.700443229675293, "rewards/process_style_reward/std": 0.37070401668548586, "rewards/table_style_reward/mean": 1.9998136401176452, "rewards/table_style_reward/std": 0.26890223439782857, "step": 3850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.18, "completions/max_terminated_length": 405.18, "completions/mean_length": 334.3775, "completions/mean_terminated_length": 334.3775, "completions/min_length": 270.76, "completions/min_terminated_length": 270.76, "epoch": 2.6, "frac_reward_zero_std": 0.21, "grad_norm": 0.0, "learning_rate": 3.5016666666666665e-07, "loss": 0.0018, "num_tokens": 33923707.0, "reward": 9.698110904693603, "reward_std": 0.2213594539882615, "rewards/accuracy_reward/mean": 0.7375, "rewards/accuracy_reward/std": 0.27844556748867033, "rewards/chart_type_reward/mean": 0.91, "rewards/chart_type_reward/std": 0.1040399980545044, "rewards/format_reward/mean": 2.0, "rewards/format_reward/std": 0.0, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 1.0, "rewards/num_token_reward/std": 0.0, "rewards/process_style_reward/mean": 1.4740183496475219, "rewards/process_style_reward/std": 0.416292352899909, "rewards/table_style_reward/mean": 2.076592493057251, "rewards/table_style_reward/std": 0.3206081053614616, "step": 3900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.28, "completions/max_terminated_length": 381.28, "completions/mean_length": 318.2575, "completions/mean_terminated_length": 318.2575, "completions/min_length": 256.24, "completions/min_terminated_length": 256.24, "epoch": 2.6333333333333333, "frac_reward_zero_std": 0.23, "grad_norm": 0.0, "learning_rate": 3.418333333333333e-07, "loss": -0.0002, "num_tokens": 34343654.0, "reward": 9.784586238861085, "reward_std": 0.21556221422739327, "rewards/accuracy_reward/mean": 0.77, "rewards/accuracy_reward/std": 0.24662194311618804, "rewards/chart_type_reward/mean": 0.95, "rewards/chart_type_reward/std": 0.03207134842872619, "rewards/format_reward/mean": 2.0, "rewards/format_reward/std": 0.0, "rewards/length_think_reward/mean": 1.499375, "rewards/length_think_reward/std": 0.001767766922712326, "rewards/num_token_reward/mean": 1.0, "rewards/num_token_reward/std": 0.0, "rewards/process_style_reward/mean": 1.5625994729995727, "rewards/process_style_reward/std": 0.4333411505073309, "rewards/table_style_reward/mean": 2.002611801624298, "rewards/table_style_reward/std": 0.2887386105395853, "step": 3950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.1, "completions/max_terminated_length": 385.1, "completions/mean_length": 332.6775, "completions/mean_terminated_length": 332.6775, "completions/min_length": 280.72, "completions/min_terminated_length": 280.72, "epoch": 2.6666666666666665, "frac_reward_zero_std": 0.2, "grad_norm": 3.284695127062566, "learning_rate": 3.335e-07, "loss": -0.0007, "num_tokens": 34769621.0, "reward": 9.657586326599121, "reward_std": 0.25254386219428854, "rewards/accuracy_reward/mean": 0.73, "rewards/accuracy_reward/std": 0.30427919447422025, "rewards/chart_type_reward/mean": 0.93, "rewards/chart_type_reward/std": 0.07483314633369446, "rewards/format_reward/mean": 2.0, "rewards/format_reward/std": 0.0, "rewards/length_think_reward/mean": 1.5, "rewards/length_think_reward/std": 0.0, "rewards/num_token_reward/mean": 1.0, "rewards/num_token_reward/std": 0.0, "rewards/process_style_reward/mean": 1.5347757422924042, "rewards/process_style_reward/std": 0.3837696108222008, "rewards/table_style_reward/mean": 1.9628105711936952, "rewards/table_style_reward/std": 0.36233584862202406, "step": 4000 }, { "epoch": 2.6666666666666665, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 440.6, "eval_completions/max_terminated_length": 440.6, "eval_completions/mean_length": 307.24125, "eval_completions/mean_terminated_length": 307.24125, "eval_completions/min_length": 217.76, "eval_completions/min_terminated_length": 217.76, "eval_frac_reward_zero_std": 0.71, "eval_loss": -0.0005738374311476946, "eval_num_tokens": 34769621.0, "eval_reward": 7.5656030654907225, "eval_reward_std": 0.0531126305134967, "eval_rewards/accuracy_reward/mean": 0.8525, "eval_rewards/accuracy_reward/std": 0.2918598711490631, "eval_rewards/chart_type_reward/mean": 0.61375, "eval_rewards/chart_type_reward/std": 0.466957231760025, "eval_rewards/format_reward/mean": 2.0, "eval_rewards/format_reward/std": 0.0, "eval_rewards/length_think_reward/mean": 1.5, "eval_rewards/length_think_reward/std": 0.0, "eval_rewards/num_token_reward/mean": 1.0, "eval_rewards/num_token_reward/std": 0.0, "eval_rewards/process_style_reward/mean": 0.8417280244827271, "eval_rewards/process_style_reward/std": 0.2765642327070236, "eval_rewards/table_style_reward/mean": 0.7576250004768371, "eval_rewards/table_style_reward/std": 0.030703708827495575, "eval_runtime": 331.3654, "eval_samples_per_second": 0.604, "eval_steps_per_second": 0.021, "step": 4000 } ], "logging_steps": 50, "max_steps": 6000, "num_input_tokens_seen": 34769621, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }